If the RDS objects have already been created in a previous run, the following cell can be uncommented and run, and the remaining cells in the data import section can be skipped. See RDS object section).
# data_list <- list.files(path = here("results-refseq/r_objects"),
# pattern = ".RDS",
# recursive = TRUE, full.names = TRUE) %>%
# set_names(., str_remove(basename(.), ".RDS")) %>%
# purrr::imap(~ readRDS(file = .x ))
#
# list2env(data_list, .GlobalEnv)
# rm(data_list)
# read in fastq screen files
fastq_screen <- list.files(
path = here("results-refseq/fastq-screen"), pattern = "_screen.txt",
recursive = TRUE, full.names = TRUE
) %>%
read_delim(skip = 1, delim = "\t", n_max = 2, id = "read", show_col_types = FALSE) %>%
rename("#Multiple_hits_multiple_genomes" = Multiple_hits_multiple_genomes) %>%
mutate(read = str_remove(basename(read), "_screen.txt")) %>%
mutate(sample = as.factor(str_remove(read, "_R[1,2].*"))) %>%
mutate(Genome = as.factor(Genome))
# combine R1 and R2 stats per species and recalculate percentages
fastq_screen <- fastq_screen %>%
select(!starts_with("%")) %>%
group_by(sample, Genome) %>%
summarise(across(.cols = `#Reads_processed`:`#Multiple_hits_multiple_genomes`, .fns = ~ sum(.x)), .groups = "drop_last") %>%
mutate(across(.cols = `#Unmapped`:`#Multiple_hits_multiple_genomes`, .fns = ~ .x / `#Reads_processed`, .names = "%_{.col}")) %>%
rename_with(~ str_replace(.x, "#", ""), .cols = starts_with("%"))
# read in samtools flagstat and metadata files
mapping_stats_df <- read_delim((here("results-refseq/mapping_stats.csv")), col_types = list(sample = "factor")) %>%
mutate(primary_mapped_human = primary_mapped_all - primary_mapped_pk)
# merge dataframes
fastq_screen_mapping_stats_df <- mapping_stats_df %>%
left_join(fastq_screen, by = "sample")
# Picard PCR duplicates
pcr_dups <- read_delim(here("results-refseq/multiqc/star_salmon/multiqc_report_data/multiqc_picard_dups.txt"), delim = "\t", col_types = list(Sample = "factor"))
# STAR mapping statistics
star_stats <- read_delim(here("results-refseq/multiqc/star_salmon/multiqc_report_data/mqc_star_alignment_plot_1.txt"), delim = "\t", col_types = list(Sample = "factor"))
salmon gene counts and tximport gene counts/TPM
abundancesImport the following sets of count files:
nf-core/rnaseq pipeline
through tximport of Salmon transcript-level counts )nf-core/rnaseq pipeline
through tximport of Salmon transcript-level counts )quant.sf files via
tximport. Values should correspond exactly to the other
gene-level counts). Used for visualisation.Background info on the different types of counts and how to import them:
salmonoutputs two types of counts: transcript-level (quant.sf) and gene-level (quant.genes.sf). Both offer feature length, effective length, TPM, and number of reads. However, when importing counts withtximportfor downstream differential expression analysis, transcript-level is preferred, becausetximportcan perform gene-level aggregation across all samples, instead of the per-sample aggregation that is done for thequant.genes.sffiles. See: https://github.com/COMBINE-lab/salmon/issues/437.
For
tximport, there are several different ways of creating count matrices (i.e., using raw counts with length offset or by scaling the TPM abundances), but this should only be relevant for downstream differential expression testing) See the description of different options provided by the nf-core/rnaseq pipeline: https://nf-co.re/rnaseq/3.14.0/docs/output/#pseudoalignment and thetximportdocs here: https://bioconductor.org/packages/release/bioc/vignettes/tximport/inst/doc/tximport.html.
The Salmon transcript-level quant files can be imported using tximport and then converted into a DESeq object (~SummarizedExperiment). This approach provides the same count matrix as the tximport gene counts csv file, but it enables the DESeq2 approach for visualisation, namely transforming the counts using
vst(variance stabilizing transformation) orrlogin order to remove the dependence of the variance on the mean. See DESeq2 vignette.
# read in sample metadata
samples <- read_csv(here("./data/samplesheet.csv"),
col_select = c("sample", "condition", "RNA_type", "library_kit", "removal", "WBC_depletion", "kit"),
col_types = cols(.default = "f")
) %>%
mutate(RNA_type = str_to_title(RNA_type)) %>%
mutate(filter_name = factor(WBC_depletion,
levels = c("nofilter", "centpip", "plasmo", "pmacs", "cellulose"),
labels = c("no filter", "cent/pip", "plasmodipur", "PMACS", "cellulose")
)) %>%
# mutate(kit = as.factor(str_replace_all(kit, "_", " "))) %>%
mutate(kit_name = factor(kit,
levels = c("mRNA_qiagen_FSglob", "mRNA_qiagen_FSglobH", "tRNA_qiagen_FSglobH", "mRNA_illumina"),
labels = c("mRNA Qiagen FS globin", "mRNA Qiagen FS globin/rRNA", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina")
)) %>%
mutate(sample_name = as.factor(str_replace(str_replace(sample, "104974-001-0", "Sample "), "_S1_L001", paste(" -", kit_name, "-", filter_name)))) %>%
mutate(sample_short = as.factor(str_replace(str_replace(sample, "104974-001-0", "Sample "), "_S1_L001", "")))
# read in transcript info - transcript id, gene id and genes names
transcript_info <- read.csv(here("results-refseq/star_salmon/tx2gene.tsv"),
sep = "\t", header = FALSE,
col.names = c("tx", "gene_id", "gene_name")
)
transcript_info <- list(
transcript = transcript_info,
gene = unique(transcript_info[, 2:3]),
tx2gene = transcript_info[, 1:2]
)
# txi gene-level counts - same as using tximport and DESeqDataSetFromTximport except rounding needs to be done manually
# gene-level aggregation across all samples
gene_counts_txi <- read_delim(here("results-refseq/star_salmon/salmon.merged.gene_counts.tsv")) %>%
mutate(across(starts_with("X"), \(x) as.integer(round(x)))) %>%
rename_with(~ str_replace_all(.x, pattern = "\\.", replacement = "-"), .cols = `X104974.001.001_S1_L001`:`X104974.001.016_S1_L001`) %>%
rename_with(~ str_sub(.x, 2, -1), .cols = `X104974-001-001_S1_L001`:`X104974-001-016_S1_L001`)
# txi gene-level TPM abundances
gene_tpm <- read_delim(here("results-refseq/star_salmon/salmon.merged.gene_tpm.tsv")) %>%
rename_with(~ str_replace_all(.x, pattern = "\\.", replacement = "-"), .cols = `X104974.001.001_S1_L001`:`X104974.001.016_S1_L001`) %>%
rename_with(~ str_sub(.x, 2, -1), .cols = `X104974-001-001_S1_L001`:`X104974-001-016_S1_L001`)
# DESeqDataSetFromTximport - original counts + offset tximport
# to be used with DESeq2's vst/rlog transform for visualisations
salmon_quant_files <- paste0("./results-refseq/star_salmon/", list.files(path = "./results-refseq/star_salmon/", pattern = "quant.sf", recursive = TRUE))
txi <- tximport(salmon_quant_files, type = "salmon", tx2gene = transcript_info$tx2gene, txOut = FALSE)
gse <- DESeqDataSetFromTximport(txi, colData = samples, design = ~1)
# clean up
rm(txi)
gff file.gene_biotype info for P. knowlesi genes
(protein coding and r/t/nc/sn/snoRNA).# Note: not all Pk genes start with the prefix PKNH, so it is safer to read genes from GFF instead
gff <- read_delim(here("data/ref-refseq-refseq/GCF_000006355.2_GCA_000006355.2_genomic.gff.gz"), comment = "#", col_names = c("seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute")) %>%
# only keep genes, since we use aggregated gene counts
filter(feature == "gene") %>%
mutate(id = row_number()) %>%
separate_rows(attribute, sep = ";") %>%
separate(attribute, c("attribute", "attribute_value"), sep = "=") %>%
pivot_wider(names_from = attribute, values_from = attribute_value) %>%
select(c(gene_biotype, ID)) %>%
rename(gene_id = ID) %>%
mutate(species = "pk")
# check to make sure there are no genes in gff that do not appear in transcript info file
stopifnot(all(gff$gene_id %in% transcript_info$gene$gene_id))
# create gene annotation df (based on one of the count matrices, not tx2gene
# because the latter contains more features, namely pseudogenes)
gene_annotation <- gene_counts_txi %>%
select(c(gene_id, gene_name)) %>%
left_join(gff, by = "gene_id") %>%
mutate(species = case_when(is.na(species) ~ "human", .default = species))
# check to make sure all genes were assigned a species
stopifnot(!any(is.na(gene_annotation$species)))
# naive check to make sure "most" Pk genes are assigned the correct species
stopifnot((gene_annotation %>% filter(str_detect(gene_id, "PKNH")) %>% select(species) %>% unique()) == "pk")
The RefSeq gff files contains info on rRNA
in the gene_biotype attribute of gene entries (including
mitochondrial rRNA) for both species (i.e., we collected featured
labelled as gene in the 3rd column of the gff, not rRNA
directly, since counts were aggregated on the gene level).
NC_011902.2 RefSeq gene 825471 825594 . + . ID=gene-PKNH_0117700;Dbxref=GeneID:7318405;Name=PKNH_0117700;end_range=825594,.;gbkey=Gene;gene_biotype=rRNA;locus_tag=PKNH_0117700;old_locus_tag=PKH_011712;partial=true;start_range=.,825471
NC_011902.2 RefSeq rRNA 825471 825594 . + . ID=rna-XR_002461722.1;Parent=gene-PKNH_0117700;Dbxref=GeneID:7318405,GenBank:XR_002461722.1;Name=XR_002461722.1;end_range=825594,.;gbkey=rRNA;locus_tag=PKNH_0117700;orig_transcript_id=gnl|WGS:CADCXE|mrna.PKNH_0117700-RA;partial=true;product=5.8S ribosomal RNA;start_range=.,825471;transcript_id=XR_002461722.1
NC_011902.2 RefSeq exon 825471 825594 . + . ID=exon-XR_002461722.1-1;Parent=rna-XR_002461722.1;Dbxref=GeneID:7318405,GenBank:XR_002461722.1;end_range=825594,.;gbkey=rRNA;locus_tag=PKNH_0117700;orig_transcript_id=gnl|WGS:CADCXE|mrna.PKNH_0117700-RA;partial=true;product=5.8S ribosomal RNA;start_range=.,825471;transcript_id=XR_002461722.1
The bash script in scripts/2_expression_analysis/0_gene_annotations-refseq.sh
was used to retrieve the rRNA genes from both annotation files.
# zcat "${ref_human}" | awk '$3 ~ /gene/' | cut -f9 | grep -E "gene_biotype=rRNA" | grep -v "rRNA_pseudo" | grep -o "ID=[^;]*" | sed 's/ID=/human_rRNA;/' >> "${output_file}"
# zcat "${ref_pk}" | cut -f9 | grep -E "gene_biotype=rRNA" | grep -o "ID=[^;]*" | sed 's/ID=/pk_rRNA;/' >> "${output_file}"
# read in gene type table
gene_types_df <- read_csv2(here("data/ref-refseq-refseq/gene_types.csv")) %>% left_join(transcript_info$gene, by = "gene_id")
Note that the GENCODE and RefSeq annotation file contain different subsets of annotated rRNA genes! See semi-comprehensive list of human rRNA genes here: https://www.genenames.org/data/genegroup/#!/group/848. Some are not included in any reference assembly, but others appear to be exclusive to either GENCODE or RefSeq. Some examples include RNA18S and 28S being present in RefSeq only. Moreover, for GENCODE, the majority of rRNA counts appear to be zero across the board.
See tables below for a full breakdown:
knitr::kable(
gene_types_df %>%
filter((gene_type == "human_rRNA") | (gene_type == "Mt_rRNA")) %>%
left_join(gene_tpm, by = c("gene_id", "gene_name")) %>%
filter(rowSums(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`)) != 0),
caption = "Non-zero counts of human rRNA genes annotated in RefSeq"
)
| gene_type | gene_id | gene_name | 104974-001-001_S1_L001 | 104974-001-002_S1_L001 | 104974-001-003_S1_L001 | 104974-001-004_S1_L001 | 104974-001-005_S1_L001 | 104974-001-006_S1_L001 | 104974-001-007_S1_L001 | 104974-001-008_S1_L001 | 104974-001-009_S1_L001 | 104974-001-010_S1_L001 | 104974-001-011_S1_L001 | 104974-001-012_S1_L001 | 104974-001-013_S1_L001 | 104974-001-014_S1_L001 | 104974-001-015_S1_L001 | 104974-001-016_S1_L001 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| human_rRNA | gene-RNA5S1 | RNA5S1 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.119986 | 0.000000 | 0.156253 | 0.000000 |
| human_rRNA | gene-RNA5S2 | RNA5S2 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.119986 | 0.000000 | 0.156253 | 0.000000 |
| human_rRNA | gene-RNA5S3 | RNA5S3 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.119986 | 0.000000 | 0.156253 | 0.000000 |
| human_rRNA | gene-RNA5S4 | RNA5S4 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.119986 | 0.000000 | 0.156253 | 0.000000 |
| human_rRNA | gene-RNA5S5 | RNA5S5 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.119986 | 0.000000 | 0.156253 | 0.000000 |
| human_rRNA | gene-RNA5S6 | RNA5S6 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.119986 | 0.000000 | 0.156253 | 0.000000 |
| human_rRNA | gene-RNA5S7 | RNA5S7 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.119986 | 0.000000 | 0.156253 | 0.000000 |
| human_rRNA | gene-RNA5S8 | RNA5S8 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.119986 | 0.000000 | 0.156253 | 0.000000 |
| human_rRNA | gene-RNA5S9 | RNA5S9 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 1.91644 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 1.709899 | 0.500011 | 1.224046 |
| human_rRNA | gene-RNA5S10 | RNA5S10 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.119986 | 0.000000 | 0.156253 | 0.000000 |
| human_rRNA | gene-RNA5S11 | RNA5S11 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.119986 | 0.000000 | 0.156253 | 0.000000 |
| human_rRNA | gene-RNA5S12 | RNA5S12 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.119986 | 0.000000 | 0.156253 | 4.896184 |
| human_rRNA | gene-RNA5S13 | RNA5S13 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.119986 | 0.000000 | 0.156253 | 0.000000 |
| human_rRNA | gene-RNA5S14 | RNA5S14 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.119986 | 2.564849 | 0.156253 | 0.000000 |
| human_rRNA | gene-RNA5S15 | RNA5S15 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.119986 | 0.000000 | 0.156253 | 0.000000 |
| human_rRNA | gene-RNA5S16 | RNA5S16 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 | 0.000000 | 0.00000 | 0.000000 | 3.342939 | 0.119986 | 0.000000 | 0.156253 | 0.000000 |
| human_rRNA | gene-RNA5S17 | RNA5S17 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.119986 | 0.000000 | 0.156253 | 0.000000 |
| human_rRNA | gene-RNA45SN2 | RNA45SN2 | 4.751703 | 2.105310 | 4.090869 | 2.859081 | 0.000000 | 7.056017 | 1532.79602 | 920.02587 | 193.091071 | 187.12187 | 23.858285 | 27.716852 | 11.439136 | 5.365124 | 7.821932 | 6.431999 |
| human_rRNA | gene-RNA18SN2 | RNA18SN2 | 62.342454 | 107.701193 | 200.195536 | 0.000000 | 296.675752 | 0.000000 | 0.00000 | 0.00000 | 220.510714 | 374.09125 | 351.102665 | 1677.113938 | 1064.578070 | 1535.906680 | 1762.266899 | 2096.663452 |
| human_rRNA | gene-RNA5-8SN2 | RNA5-8SN2 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 138.875585 |
| human_rRNA | gene-RNA28SN2 | RNA28SN2 | 190.946875 | 250.720809 | 201.681762 | 242.757269 | 219.348795 | 0.000000 | 0.00000 | 36.77702 | 1272.445684 | 1111.53791 | 1185.237687 | 1875.808091 | 1060.493248 | 1296.851623 | 1615.471929 | 1534.355232 |
| human_rRNA | gene-RNA45SN3 | RNA45SN3 | 0.997892 | 0.000000 | 0.000000 | 0.000000 | 0.268391 | 0.000000 | 139.91191 | 91.09562 | 33.129508 | 29.48614 | 4.590309 | 1.549567 | 0.924119 | 0.393336 | 0.000000 | 0.000000 |
| human_rRNA | gene-RNA18SN3 | RNA18SN3 | 62.012092 | 105.547238 | 196.740978 | 0.000000 | 295.746390 | 0.000000 | 0.00000 | 0.00000 | 219.883500 | 374.07612 | 347.512255 | 1671.307115 | 1078.803228 | 1542.499676 | 1753.969147 | 2086.600386 |
| human_rRNA | gene-RNA5-8SN3 | RNA5-8SN3 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 58.912857 |
| human_rRNA | gene-RNA28SN3 | RNA28SN3 | 16.865757 | 25.000134 | 29.234420 | 22.112708 | 29.729779 | 0.000000 | 0.00000 | 0.00000 | 0.000000 | 0.00000 | 0.000000 | 23.257654 | 17.733035 | 15.199022 | 20.575958 | 18.046203 |
| human_rRNA | gene-RNA45SN1 | RNA45SN1 | 6.039829 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.019315 | 604.00017 | 368.80178 | 76.638722 | 68.36348 | 7.017963 | 14.415548 | 5.925401 | 2.351978 | 1.894936 | 2.098510 |
| human_rRNA | gene-RNA5-8SN1 | RNA5-8SN1 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 31.234607 |
| human_rRNA | gene-RNA28SN1 | RNA28SN1 | 67.143854 | 82.323209 | 103.902309 | 82.308384 | 119.013023 | 0.000000 | 795.14549 | 1132.53586 | 1222.286618 | 1449.88852 | 1549.155050 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| human_rRNA | gene-RNA45SN4 | RNA45SN4 | 7.349364 | 5.157419 | 5.628581 | 5.095599 | 1.426797 | 10.922209 | 1756.19524 | 1065.05597 | 251.309378 | 236.85418 | 32.406743 | 52.335264 | 23.963442 | 9.690368 | 8.997157 | 6.814541 |
| human_rRNA | gene-RNA18SN4 | RNA18SN4 | 58.828665 | 111.066179 | 205.627316 | 0.000000 | 293.319650 | 0.000000 | 0.00000 | 0.00000 | 219.759981 | 370.27779 | 348.278460 | 1666.244320 | 1079.004751 | 1542.588846 | 1756.895843 | 2073.957543 |
| human_rRNA | gene-RNA5-8SN4 | RNA5-8SN4 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 32.860697 | 36.502763 | 0.00000 | 0.00000 | 0.000000 | 0.00000 | 0.000000 | 748.116995 | 523.601567 | 636.022692 | 563.180219 | 525.143478 |
| human_rRNA | gene-RNA28SN4 | RNA28SN4 | 488.534706 | 638.796031 | 567.890117 | 603.969644 | 619.882570 | 0.000000 | 1314.76873 | 1956.17536 | 3199.299038 | 3993.27833 | 4511.897218 | 4190.916438 | 2457.827433 | 2536.060195 | 3411.269658 | 3033.838207 |
| human_rRNA | gene-RNA45SN5 | RNA45SN5 | 0.000000 | 0.194935 | 0.000000 | 0.689502 | 0.187359 | 0.000000 | 11.11903 | 11.74383 | 8.504285 | 10.58738 | 5.416470 | 0.181703 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| human_rRNA | gene-RNA28SN5 | RNA28SN5 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 | 0.000000 | 0.00000 | 23.041655 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| human_rRNA | gene-RNA5S17-2 | RNA5S17 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.00000 | 0.000000 | 0.00000 | 0.000000 | 7.800192 | 14.398334 | 14.534145 | 5.500117 | 11.832444 |
| human_rRNA | gene-RNA45SN3-2 | RNA45SN3 | 0.000000 | 3.504348 | 0.579441 | 0.561736 | 0.000000 | 1.190227 | 171.05619 | 113.23896 | 34.307079 | 34.74936 | 9.053317 | 0.392968 | 0.000000 | 0.501621 | 0.943800 | 0.933711 |
| human_rRNA | gene-RNA18SN3-2 | RNA18SN3 | 54.242554 | 114.466476 | 213.094452 | 760.232923 | 294.189391 | 0.000000 | 0.00000 | 0.00000 | 220.421934 | 367.76918 | 349.563430 | 1649.049824 | 1081.121546 | 1543.470708 | 1750.138192 | 2066.106213 |
| human_rRNA | gene-RNA28SN3-2 | RNA28SN3 | 15.566932 | 29.297394 | 30.742063 | 25.074316 | 28.841534 | 0.000000 | 0.00000 | 0.00000 | 0.000000 | 0.00000 | 0.000000 | 23.646839 | 17.483903 | 15.206186 | 20.562627 | 21.406914 |
| human_rRNA | gene-RNA45SN1-2 | RNA45SN1 | 5.973266 | 8.866557 | 3.016118 | 4.251982 | 0.589623 | 0.000000 | 731.71393 | 435.74070 | 88.820605 | 77.82077 | 8.493107 | 14.506503 | 5.872081 | 2.395843 | 5.115513 | 2.191921 |
| human_rRNA | gene-RNA28SN1-2 | RNA28SN1 | 69.232038 | 108.436101 | 102.541791 | 81.803956 | 122.395534 | 0.000000 | 994.92817 | 1309.04810 | 1320.097300 | 1613.90704 | 1654.843682 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| human_rRNA | gene-RNR1 | RNR1 | 14834.315751 | 16214.239057 | 6908.575576 | 3168.095844 | 8443.443125 | 10.732938 | 33.19660 | 30.11688 | 13.739731 | 29.93136 | 4.488337 | 3549.195879 | 3265.529789 | 462.500954 | 260.213996 | 337.772158 |
| human_rRNA | gene-RNR2 | RNR2 | 27104.637436 | 29430.692094 | 19293.910206 | 9832.230123 | 26033.024421 | 19.394133 | 65.09365 | 67.31826 | 30.271294 | 30.05422 | 21.338831 | 5784.927065 | 5240.821956 | 1166.936234 | 768.801099 | 990.511073 |
# check if GENCODE results are also present in directory
if (file.exists(here("data/ref/gene_types.csv"))) {
transcript_info_gencode <- read.csv(here("results/star_salmon/tx2gene.tsv"),
sep = "\t", header = FALSE,
col.names = c("tx", "gene_id", "gene_name")
)
transcript_info_gencode <- list(
transcript = transcript_info_gencode,
gene = unique(transcript_info_gencode[, 2:3]),
tx2gene = transcript_info_gencode[, 1:2]
)
gene_types_gencode_df <- read_csv2(here("data/ref/gene_types.csv")) %>%
left_join(transcript_info_gencode$gene, by = "gene_id")
gene_tpm_gencode <- read_delim(here("results/star_salmon/salmon.merged.gene_tpm.tsv"))
knitr::kable(
gene_types_gencode_df %>%
filter((gene_type == "human_rRNA") | (gene_type == "Mt_rRNA")) %>%
left_join(gene_tpm_gencode, by = c("gene_id", "gene_name")) %>%
filter(rowSums(across(starts_with("X"))) != 0),
caption = "Non-zero counts of human rRNA genes annotated in GENCODE"
)
}
| gene_type | gene_id | gene_name | X104974.001.001_S1_L001 | X104974.001.002_S1_L001 | X104974.001.003_S1_L001 | X104974.001.004_S1_L001 | X104974.001.005_S1_L001 | X104974.001.006_S1_L001 | X104974.001.007_S1_L001 | X104974.001.008_S1_L001 | X104974.001.009_S1_L001 | X104974.001.010_S1_L001 | X104974.001.011_S1_L001 | X104974.001.012_S1_L001 | X104974.001.013_S1_L001 | X104974.001.014_S1_L001 | X104974.001.015_S1_L001 | X104974.001.016_S1_L001 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| human_rRNA | ENSG00000199352.1 | RNA5S1 | 0.00000 | 0.0000 | 0.00000 | 0 | 0.00000 | 0.00000 | 0.00000 | 0.000000 | 0.0000 | 0.000 | 0.00000 | 0.00000 | 63.88345 | 0.000000 | 0.000000 | 0.000000 |
| human_rRNA | ENSG00000201588.1 | RNA5S2 | 0.00000 | 0.0000 | 0.00000 | 0 | 0.00000 | 0.00000 | 0.00000 | 0.000000 | 0.0000 | 0.000 | 0.00000 | 37.62973 | 0.00000 | 0.000000 | 0.000000 | 0.000000 |
| human_rRNA | ENSG00000200624.1 | RNA5S6 | 0.00000 | 0.0000 | 0.00000 | 0 | 0.00000 | 0.00000 | 0.00000 | 0.000000 | 0.0000 | 0.000 | 0.00000 | 0.00000 | 0.00000 | 49.459879 | 0.000000 | 0.000000 |
| human_rRNA | ENSG00000201321.1 | RNA5S9 | 0.00000 | 0.0000 | 0.00000 | 0 | 0.00000 | 0.00000 | 0.00000 | 4.530894 | 0.0000 | 0.000 | 0.00000 | 0.00000 | 0.00000 | 1.764045 | 0.564701 | 1.347898 |
| human_rRNA | ENSG00000202526.1 | RNA5S13 | 0.00000 | 0.0000 | 0.00000 | 0 | 0.00000 | 0.00000 | 0.00000 | 0.000000 | 0.0000 | 0.000 | 0.00000 | 0.00000 | 0.00000 | 0.000000 | 0.000000 | 37.116281 |
| human_rRNA | ENSG00000201355.1 | RNA5S14 | 0.00000 | 0.0000 | 0.00000 | 0 | 0.00000 | 0.00000 | 0.00000 | 0.000000 | 0.0000 | 0.000 | 0.00000 | 0.00000 | 0.00000 | 0.000000 | 48.057305 | 39.899804 |
| human_rRNA | ENSG00000199839.1 | RNA5SP150 | 0.00000 | 0.0000 | 0.00000 | 0 | 0.00000 | 0.00000 | 0.00000 | 0.000000 | 0.0000 | 0.000 | 0.00000 | 0.00000 | 0.00000 | 0.000000 | 0.000000 | 0.396039 |
| human_rRNA | ENSG00000283274.1 | 5_8S_rRNA | 0.00000 | 0.0000 | 0.00000 | 0 | 0.00000 | 0.00000 | 0.00000 | 0.000000 | 0.0000 | 0.000 | 0.00000 | 0.00000 | 0.00000 | 0.000000 | 0.000000 | 0.214992 |
| human_rRNA | ENSG00000283291.1 | 5_8S_rRNA | 0.00000 | 0.0000 | 0.00000 | 0 | 0.00000 | 0.00000 | 0.00000 | 0.000000 | 0.0000 | 0.000 | 0.00000 | 0.00000 | 0.00000 | 0.000000 | 0.000000 | 1.289955 |
| human_rRNA | ENSG00000278233.1 | RNA5-8SN2 | 0.00000 | 16.6997 | 0.00000 | 0 | 0.00000 | 0.00000 | 76.32146 | 0.000000 | 0.0000 | 0.000 | 82.78823 | 119.82333 | 180.19770 | 71.675414 | 54.502627 | 91.872277 |
| human_rRNA | ENSG00000277739.1 | 5_8S_rRNA | 0.00000 | 0.0000 | 0.00000 | 0 | 0.00000 | 0.00000 | 0.00000 | 80.526347 | 0.0000 | 0.000 | 0.00000 | 101.55505 | 90.56981 | 69.776516 | 58.522634 | 91.386372 |
| human_rRNA | ENSG00000275215.1 | RNA5-8SN3 | 11.15067 | 0.0000 | 18.24354 | 0 | 0.00000 | 0.00000 | 0.00000 | 0.000000 | 94.8767 | 0.000 | 0.00000 | 109.06286 | 88.00438 | 73.524728 | 62.021795 | 91.480219 |
| human_rRNA | ENSG00000278189.1 | RNA5-8SN1 | 0.00000 | 0.0000 | 0.00000 | 0 | 38.72457 | 0.00000 | 0.00000 | 0.000000 | 0.0000 | 0.000 | 0.00000 | 104.88716 | 43.51271 | 72.784121 | 73.098666 | 92.466613 |
| human_rRNA | ENSG00000276871.1 | 5_8S_rRNA | 0.00000 | 0.0000 | 0.00000 | 0 | 0.00000 | 0.00000 | 0.00000 | 0.000000 | 0.0000 | 0.000 | 0.00000 | 0.00000 | 0.00000 | 0.000000 | 0.000000 | 0.214992 |
| human_rRNA | ENSG00000274917.1 | RNA5-8SN5 | 0.00000 | 0.0000 | 0.00000 | 0 | 0.00000 | 0.00000 | 0.00000 | 0.000000 | 0.0000 | 0.000 | 0.00000 | 105.47408 | 28.43676 | 78.916641 | 77.600476 | 91.012288 |
| human_rRNA | ENSG00000273730.1 | 5_8S_rRNA | 0.00000 | 0.0000 | 0.00000 | 0 | 0.00000 | 0.00000 | 0.00000 | 0.000000 | 0.0000 | 0.000 | 0.00000 | 94.87708 | 34.08167 | 80.865489 | 82.939565 | 91.298725 |
| human_rRNA | ENSG00000278294.1 | 5_8S_rRNA | 0.00000 | 0.0000 | 0.00000 | 0 | 0.00000 | 0.00000 | 0.00000 | 0.000000 | 0.0000 | 0.000 | 0.00000 | 0.00000 | 0.00000 | 0.000000 | 0.000000 | 1.289955 |
| human_rRNA | ENSG00000276700.1 | RNA5-8SN4 | 0.00000 | 0.0000 | 0.00000 | 0 | 0.00000 | 0.00000 | 0.00000 | 0.000000 | 0.0000 | 102.071 | 0.00000 | 86.60765 | 36.65530 | 80.646396 | 79.449934 | 90.904625 |
| human_rRNA | ENSG00000275757.1 | 5_8S_rRNA | 0.00000 | 0.0000 | 0.00000 | 0 | 0.00000 | 66.87828 | 0.00000 | 0.000000 | 0.0000 | 0.000 | 0.00000 | 84.14490 | 44.42230 | 81.411437 | 80.039742 | 91.985737 |
gene_types_df %>%
left_join(gene_tpm, by = c("gene_id", "gene_name")) %>%
filter(rowSums(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`)) != 0) %>%
pivot_longer(
cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`,
names_to = "sample",
values_to = "counts"
) %>%
left_join(samples, by = "sample") %>%
group_by(gene_id) %>%
mutate(total = sum(counts)) %>%
ungroup() %>%
arrange(gene_type, desc(total)) %>%
mutate(gene_name = factor(gene_name, levels = unique(gene_name))) %>%
ggplot(aes(x = gene_name, y = counts, fill = sample_short)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(option = "viridis", discrete = T) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
coord_flip() +
scale_x_discrete(limits = rev) +
labs(fill = "Sample", title = "RefSeq annotated rRNA - TPM") +
ylab(label = "TPM") +
xlab(label = "Gene name") +
facet_wrap(~kit_name) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale()))
gene_types_gencode_df %>%
left_join(gene_tpm_gencode, by = c("gene_id", "gene_name")) %>%
filter(rowSums(across(starts_with("X"))) != 0) %>%
pivot_longer(
cols = `X104974.001.001_S1_L001`:`X104974.001.016_S1_L001`,
names_to = "sample",
values_to = "counts"
) %>%
mutate(sample = str_replace_all(sample, "\\.", replacement = "-")) %>%
mutate(sample = str_sub(sample, 2, -1)) %>%
left_join(samples, by = "sample") %>%
group_by(gene_id) %>%
mutate(total = sum(counts)) %>%
ungroup() %>%
arrange(gene_type, desc(total)) %>%
mutate(gene_name = factor(gene_name, levels = unique(gene_name))) %>%
ggplot(aes(x = gene_name, y = counts, fill = sample_short)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(option = "viridis", discrete = T) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
coord_flip() +
scale_x_discrete(limits = rev) +
labs(fill = "Sample", title = "GENCODE annotated rRNA - TPM") +
ylab(label = "TPM") +
xlab(label = "Gene name") +
facet_wrap(~kit_name) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale()))
For human globins, the following genes were included: “HBB”,“HBD”,“HBBP1”,“HBG1”,“HBG2”,“HBE1”,“HBZ”,“HBM”,“HBA2”,“HBA1”,“HBQ1”.
globin_names <- c("HBB", "HBD", "HBBP1", "HBG1", "HBG2", "HBE1", "HBZ", "HBM", "HBA2", "HBA1", "HBQ1")
globin_df <- tibble(gene_name = globin_names, gene_type = "human_globin") %>% left_join(transcript_info$gene, by = "gene_name")
gene_types_df <- gene_types_df %>% bind_rows(globin_df)
# export full gene type file
write_csv2(gene_types_df, here("data/ref-refseq-refseq/gene_types_R.csv"))
# some gene id share the same gene name
# gene_types_df %>%
# group_by(gene_name) %>%
# filter(n()>1)
Merge globin and rRNA annotations with gene annotation dataframe:
gene_annotation <- gene_annotation %>%
left_join(gene_types_df, by = c("gene_id", "gene_name")) %>%
mutate("gene_type" = coalesce(gene_type, species))
# merge gene_type annotation and fill in missing types using species name
annotate_gene_type <- function(data, gene_annotation) {
data %>%
left_join(gene_annotation, by = c("gene_id", "gene_name"))
}
gene_counts_txi <- annotate_gene_type(gene_counts_txi, gene_annotation)
gene_tpm <- annotate_gene_type(gene_tpm, gene_annotation)
# for SummarizedExperiment DESeq2 object, add annotations to rowData
rowData(gse) <- gene_annotation
# ensure genes are in the same order in annotation data and count files
# stopifnot(all(rownames(txi$abundance) == gene_annotation$gene_id))
stopifnot(all(rownames(gse) == gene_annotation$gene_id))
gse_pk <- gse[rowData(gse)$species == "pk", ]
# or
# pk_genes <- gene_annotation %>% filter(species == "pk") %>% select(gene_id) %>% pull()
# gse_pk <- gse[pk_genes]
The raw counts can optionally be pre-filtered to remove rows that carry little to no information on gene expression levels. The RNA-seq tutorial by Love et. al 2019 and the DESeq2 vignette recommended a minimum count of 10 reads for a minimal number of samples (e.g., the size of the smallest group). Since we do not have replicates for the different treatment groups, we instead opt for a minimum of 10 reads for at least a single sample, as a conservative filtering approach (e.g., only genes for which none of the samples have a higher read count than 10 are removed).
# create filter for all genes on the level of the DESeq/tximport gene counts
filter_all_genes <- rowSums(counts(gse) >= 10) >= 1
# should be the same as filtering on tximport raw gene counts from nf-core pipeline
stopifnot(all(filter_all_genes == (rowSums(gene_counts_txi[, 3:18] >= 10) >= 1)))
# create filtered versions of deseq2 objects: all genes, pk genes, pk protein-coding
gse_filtered <- gse[filter_all_genes, ]
gse_pk_filtered <- gse_filtered[rowData(gse_filtered)$species == "pk", ]
gse_pk_filtered_coding <- gse_filtered[rowData(gse_filtered)$species == "pk" & rowData(gse_filtered)$gene_biotype == "protein_coding", ]
# create filtered versions of other count matrices
gene_tpm_filtered <- gene_tpm[filter_all_genes, ]
gene_counts_txi_filtered <- gene_counts_txi[filter_all_genes, ]
# check if filter was applied correctly, i.e. order of genes is the same in gse and tpm matrix
stopifnot(all.equal(gene_tpm_filtered, gene_tpm[(rowSums(gene_counts_txi[, 3:18] >= 10) >= 1), ]))
stopifnot(gene_tpm$gene_id == rownames(gse))
stopifnot(gene_tpm_filtered$gene_id == rownames(gse_filtered))
stopifnot(all.equal(gene_counts_txi_filtered, gene_counts_txi[(rowSums(gene_counts_txi[, 3:18] >= 10) >= 1), ]))
stopifnot(gene_counts_txi$gene_id == rownames(gse))
stopifnot(gene_counts_txi$gene_id == gene_tpm$gene_id)
stopifnot(gene_counts_txi_filtered$gene_id == rownames(gse_filtered))
The number of genes were filtered down from 55743 to 27955 and from 5452 to 5147, for all genes and P. knowlesi genes respectively. So in conclusion, the effect is relatively minor for P. knowlesi, but a substantial number of human genes were removed.
It is difficult to determine an appropriate threshold for filtering on the TPM level, so instead we filter the same genes as on the raw count level (>=10 for at least 1 sample).
dim(gene_tpm)
## [1] 55743 21
mean.tpm <- apply(gene_tpm[, 3:18], 1, mean)
hist(log10(mean.tpm), main = "TPM before filtering")
# hist(log10(mean.tpm+1e-5))
dim(gene_tpm_filtered)
## [1] 27955 21
mean.tpm <- apply(gene_tpm_filtered[, 3:18], 1, mean)
hist(log10(mean.tpm), main = "TPM after filtering")
# cast to long format
to_long <- function(data) {
data %>%
pivot_longer(cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`, names_to = "sample", values_to = "counts")
}
gene_counts_txi_long <- to_long(gene_counts_txi)
gene_counts_txi_filtered_long <- to_long(gene_counts_txi_filtered)
gene_tpm_long <- to_long(gene_tpm)
gene_tpm_filtered_long <- to_long(gene_tpm_filtered)
For visualisation purposes (e.g., PCA or heatmaps), gene counts
should be transformed so that they all have the same range of variances,
i.e. homoskedastic, instead of the variance depending on the mean.
DESeq2 offers two transform options to stabilize the variance,
vst and rlog. According to Love
et al., 2019, rlog, while slower, works better for small datasets (n
< 30) and when there are large differences in sequencing depth across
samples.
We use blind=TRUE to obtain a fully unsupervised
transformation (independent of the design, although we are using an
intercept-only model anyway), generally intended to observe batch
effects. However, since we can have large differences in counts between
samples due to our experimental design (e.g., zero counts for rRNA or
globins in some samples), the transformation might not work as well as
intended and shrink values too much (DESeq2
vignette).
Also note that attempting to transform unfiltered counts can lead to strange outcomes.
Lastly, the transformation should ideally be performed once on the entire set of gene counts (after filtering out low counts), and only then subset on different gene groups of interest (e.g., species, protein-coding genes only, etc.), rather than re-applying the transformation to a subset of counts. However, in actual experiments focused on P. knowlesi, researchers will also throw out human gene counts (e.g., by not even mapping to the human reference or simply discarding those reads). In that sense, evaluating the results by transforming only the genes of interest can be considered the better approach.
# # variance stabilizing transformation without filtering
# vsd <- vst(gse, blind = TRUE)
# p <- meanSdPlot(assay(vsd))
# p$gg + labs(title = "VST on unfiltered counts")
# extract transformed gene counts
# gene_counts_vst <- data.frame(assays(vsd)[[1]])
# colnames(gene_counts_vst) <- colData(gse)$sample
# gene_counts_vst <- gene_counts_vst %>%
# mutate(gene_id = rownames(gene_counts_vst)) %>%
# left_join(transcript_info$gene, by = "gene_id")
# variance stabilizing transform on filtered object
vsd_filtered <- vst(gse_filtered, blind = TRUE)
p <- meanSdPlot(assay(vsd_filtered))
p$gg + labs(title = "VST on filtered counts")
# compare with rlog transform on filtered object
rld_filtered <- rlog(gse_filtered, blind = TRUE)
p <- meanSdPlot(assay(rld_filtered))
p$gg + labs(title = "Rlog on filtered counts")
# transform after subsetting on species/protein-coding genes
# vsd_pk <- vst(gse_pk, blind = TRUE)
# p <- meanSdPlot(assay(vsd_pk))
# p$gg + labs(title = "VST on unfiltered counts after subsetting Pk genes")
vsd_pk_filtered <- vst(gse_pk_filtered, blind = TRUE)
p <- meanSdPlot(assay(vsd_pk_filtered))
p$gg + labs(title = "VST on filtered counts after subsetting Pk genes")
rld_pk_filtered <- rlog(gse_pk_filtered, blind = TRUE)
p <- meanSdPlot(assay(rld_pk_filtered))
p$gg + labs(title = "Rlog on filtered counts after subsetting Pk genes")
vsd_pk_filtered_coding <- vst(gse_pk_filtered_coding, blind = TRUE)
p <- meanSdPlot(assay(vsd_pk_filtered))
p$gg + labs(title = "VST on filtered counts after subsetting Pk protein-coding genes")
rld_pk_filtered_coding <- rlog(gse_pk_filtered_coding, blind = TRUE)
p <- meanSdPlot(assay(rld_pk_filtered))
p$gg + labs(title = "Rlog on filtered counts after subsetting Pk protein-coding genes")
# extract transformed gene counts
# gene_counts_vst_pk <- data.frame(assays(vsd_pk)[[1]])
# colnames(gene_counts_vst_pk) <- colData(vsd_pk)$sample
# gene_counts_vst_pk <- gene_counts_vst_pk %>%
# mutate(gene_id = rownames(gene_counts_vst_pk)) %>%
# left_join(transcript_info$gene, by = "gene_id")
Save all output to rds object for easier loading.
# dir.create(here("results-refseq/r_objects/"), showWarnings = FALSE)
#
# purrr::iwalk(
# tibble:::lst(
# fastq_screen,
# fastq_screen_mapping_stats_df,
# gene_annotation,
# gene_counts_txi,
# gene_counts_txi_long,
# gene_counts_txi_filtered,
# gene_counts_txi_filtered_long,
# gene_tpm,
# gene_tpm_long,
# gene_tpm_filtered,
# gene_tpm_filtered_long,
# gse,
# gse_filtered,
# gse_pk,
# gse_pk_filtered,
# mapping_stats_df,
# pcr_dups,
# samples,
# star_stats,
# # trancsript_info,
# rld_filtered,
# rld_pk_filtered,
# rld_pk_filtered_coding,
# vsd_filtered,
# vsd_pk_filtered,
# vsd_pk_filtered_coding,
# to_long,
# ),
# ~ saveRDS(.x, file = here(paste0("results-refseq/r_objects/", .y, ".RDS")))
# )
Data clean up
# clean up
rm(gene_types_df)
# rm(transcript_info)
rm(globin_df)
rm(gene_tpm_gencode)
rm(gene_types_gencode_df)
rm(transcript_info_gencode)
rm(gff)
rm(p)
rm(filter_all_genes)
dir.create(here("results-refseq/figures/"), showWarnings = FALSE)
Number of genes per species:
gene_counts_txi %>%
group_by(species) %>%
summarise(n = n()) %>%
ungroup() %>%
mutate(total = sum(n), rel.freq = n / total)
Number of genes per species after filtering low counts (n < 10 for all samples):
gene_counts_txi_filtered %>%
group_by(species) %>%
summarise(n = n()) %>%
ungroup() %>%
mutate(total = sum(n), rel.freq = n / total)
Number of gene types per species:
gene_counts_txi %>%
group_by(species, gene_type) %>%
summarise(n = n()) %>%
ungroup() %>%
mutate(total = sum(n), rel.freq = round(n / total, 5))
## `summarise()` has grouped output by 'species'. You can override using the
## `.groups` argument.
Number of gene types per species after filtering out low counts (n < 10 for all samples):
gene_counts_txi_filtered %>%
group_by(species, gene_type) %>%
summarise(n = n()) %>%
ungroup() %>%
mutate(total = sum(n), rel.freq = round(n / total, 5))
## `summarise()` has grouped output by 'species'. You can override using the
## `.groups` argument.
n_genes <- dim(gene_counts_txi)[1]
# stopifnot(dim(gene_counts_txi)[1] == (gene_counts_salmon %>% group_by(sample) %>% summarise(n = n()) %>% select(n) %>% unique()))
n_genes_filtered <- dim(gse_filtered)[1]
n_human_genes <- gene_counts_txi %>%
filter(species == "human") %>%
count() %>%
pull()
n_human_genes_filtered <- gene_counts_txi_filtered %>%
filter(species == "human") %>%
count() %>%
pull()
n_pk_genes <- dim(gse_pk)[1]
stopifnot(dim(gse_pk)[1] == (gene_counts_txi %>% filter(species == "pk") %>% count()))
n_pk_genes_filtered <- dim(gse_pk_filtered)[1]
stopifnot(dim(gse_pk_filtered)[1] == (gene_counts_txi_filtered %>% filter(species == "pk") %>% count()))
The direct comparison of RPKM and TPM across samples is meaningful only when there are equal total RNAs between compared samples and the distribution of RNA populations are close to each other.
Make sure both samples use the same RNA isolation approach [poly(A)+ selection versus ribosomal RNA depletion]. If not, they should not be compared.
Check the fraction of the ribosomal, mitochondrial and globin RNAs, and the top highly expressed transcripts and see whether such RNAs constitute a very large part of the sequenced reads in a sample, and thus decrease the sequencing “real estate” available for the remaining genes in that sample. If the calculated fractions in two samples differ significantly, do not compare RPKM or TPM values directly.
TPM should never be used for quantitative comparisons across samples when the total RNA contents and its distributions are very different. However, under appropriate circumstances, TPM can still be useful for qualitative comparison such as PCA and clustering analysis.
(Zhao et al., 2020 - http://dx.doi.org/10.1261/rna.074922.120)
Looking at our samples, especially the ones prepared using the Illumina kit exhibit much higher total RNA content. Based on the output of Picard MarkDuplicates, the differences can at least partially be attributed to the number of PCR duplicates. The proportion of duplicates is comparable across kits, but they do appear to be consistently higher for some depletion methods (no filter > centpip > plasmo > pmacs > cellulose).
gene_counts_txi_filtered %>%
summarise(across(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`, sum)) %>%
to_long() %>%
left_join(samples, by = "sample") %>%
ggplot(aes(x = sample_short, y = counts, fill = kit_name)) +
geom_bar(stat = "identity") +
scale_fill_viridis(option = "viridis", discrete = T) +
# labels = c("mRNA Illumina", "mRNA Qiagen globin depletion", "mRNA Qiagen globin/rRNA depletion", "tRNA Qiagen globin/rRNA depletion")) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = "Number of reads", fill = "Kit")
pcr_dups %>%
left_join(samples, by = join_by(Sample == sample)) %>%
mutate(READS_IN_DUPLICATE_PAIRS = 2 * READ_PAIR_DUPLICATES) %>%
mutate(READS_IN_UNIQUE_PAIRS = 2 * (READ_PAIRS_EXAMINED - READ_PAIR_DUPLICATES)) %>%
mutate(READS_IN_UNIQUE_UNPAIRED = UNPAIRED_READS_EXAMINED - UNPAIRED_READ_DUPLICATES) %>%
mutate(READS_IN_DUPLICATE_PAIRS_OPTICAL = 2 * READ_PAIR_OPTICAL_DUPLICATES) %>%
mutate(READS_IN_DUPLICATE_PAIRS_NONOPTICAL = READS_IN_DUPLICATE_PAIRS - READS_IN_DUPLICATE_PAIRS_OPTICAL) %>%
mutate(READS_IN_DUPLICATE_UNPAIRED = UNPAIRED_READ_DUPLICATES) %>%
mutate(READS_UNMAPPED = UNMAPPED_READS) %>%
select(c(sample_name, condition, kit, READS_IN_UNIQUE_PAIRS, READS_IN_UNIQUE_UNPAIRED, READS_IN_DUPLICATE_PAIRS_OPTICAL, READS_IN_DUPLICATE_PAIRS_NONOPTICAL, READS_IN_DUPLICATE_UNPAIRED)) %>%
pivot_longer(cols = 4:8, names_to = "type", values_to = "counts") %>%
ggplot(aes(x = sample_name, y = counts, fill = type)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(option = "viridis", discrete = T) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = "Number of reads", fill = "Picard deduplication stats")
The output reported by samtool flagstats and STAR’s
Log.final.out files cannot be compared directly: the latter
reports counts of read-pairs as opposed to individual reads, and the
former does not contain unmapped reads (these are not added to the
.bam outputs by default) while also counting multi-mapped
reads. We opted to report the STAR statistics, but the flagstat output
is included for completion’s sake.
star_stats
star_stats %>%
pivot_longer(cols = 2:6, names_to = "type", values_to = "counts") %>%
left_join(samples, by = join_by(Sample == sample)) %>%
ggplot(aes(x = sample_name, y = counts, fill = type)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(option = "viridis", discrete = T) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = "Number of read pairs", fill = "Mapping stats")
mapping_stats_df
mapping_stats_df %>%
left_join(samples, by = "sample") %>%
pivot_longer(cols = primary_mapped_pk:primary_mapped_human, names_to = "type", values_to = "counts") %>%
ggplot(aes(x = sample_name, y = counts, fill = type)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(option = "viridis", discrete = T) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = "Number of primary reads", fill = "Samtools flagstat")
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-B2M | B2M | NA | human | human | 104974-001-001_S1_L001 | 27679.223 |
| gene-RNR2 | RNR2 | NA | human | human_rRNA | 104974-001-001_S1_L001 | 27104.637 |
| gene-S100A8 | S100A8 | NA | human | human | 104974-001-001_S1_L001 | 22578.637 |
| gene-S100A9 | S100A9 | NA | human | human | 104974-001-001_S1_L001 | 20902.902 |
| gene-ND3 | ND3 | NA | human | human | 104974-001-001_S1_L001 | 19021.278 |
| gene-RPS27 | RPS27 | NA | human | human | 104974-001-001_S1_L001 | 17762.379 |
| gene-RNR1 | RNR1 | NA | human | human_rRNA | 104974-001-001_S1_L001 | 14834.316 |
| gene-COX3 | COX3 | NA | human | human | 104974-001-001_S1_L001 | 14662.673 |
| gene-ATP6 | ATP6 | NA | human | human | 104974-001-001_S1_L001 | 11625.447 |
| gene-TMSB4X | TMSB4X | NA | human | human | 104974-001-001_S1_L001 | 11335.273 |
| gene-FTL | FTL | NA | human | human | 104974-001-001_S1_L001 | 10562.604 |
| gene-COX2 | COX2 | NA | human | human | 104974-001-001_S1_L001 | 10466.481 |
| gene-RPL41 | RPL41 | NA | human | human | 104974-001-001_S1_L001 | 9533.806 |
| gene-ND2 | ND2 | NA | human | human | 104974-001-001_S1_L001 | 9363.211 |
| gene-RPLP2 | RPLP2 | NA | human | human | 104974-001-001_S1_L001 | 9110.287 |
| gene-RPS20 | RPS20 | NA | human | human | 104974-001-001_S1_L001 | 8465.206 |
| gene-RPL21 | RPL21 | NA | human | human | 104974-001-001_S1_L001 | 8463.110 |
| gene-RPS12 | RPS12 | NA | human | human | 104974-001-001_S1_L001 | 7906.045 |
| gene-COX1 | COX1 | NA | human | human | 104974-001-001_S1_L001 | 7773.208 |
| gene-ND1 | ND1 | NA | human | human | 104974-001-001_S1_L001 | 7213.298 |
| gene-RPS21 | RPS21 | NA | human | human | 104974-001-001_S1_L001 | 7198.383 |
| gene-EEF1A1 | EEF1A1 | NA | human | human | 104974-001-001_S1_L001 | 6880.381 |
| gene-RPS24 | RPS24 | NA | human | human | 104974-001-001_S1_L001 | 6067.828 |
| gene-RPL30 | RPL30 | NA | human | human | 104974-001-001_S1_L001 | 5941.840 |
| gene-B2M-2 | B2M | NA | human | human | 104974-001-001_S1_L001 | 5852.971 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-B2M | B2M | NA | human | human | 104974-001-002_S1_L001 | 34855.539 |
| gene-RNR2 | RNR2 | NA | human | human_rRNA | 104974-001-002_S1_L001 | 29430.692 |
| gene-S100A9 | S100A9 | NA | human | human | 104974-001-002_S1_L001 | 24583.661 |
| gene-S100A8 | S100A8 | NA | human | human | 104974-001-002_S1_L001 | 24327.359 |
| gene-ND3 | ND3 | NA | human | human | 104974-001-002_S1_L001 | 16508.778 |
| gene-RNR1 | RNR1 | NA | human | human_rRNA | 104974-001-002_S1_L001 | 16214.239 |
| gene-COX3 | COX3 | NA | human | human | 104974-001-002_S1_L001 | 15349.098 |
| gene-RPS27 | RPS27 | NA | human | human | 104974-001-002_S1_L001 | 13599.609 |
| gene-FTL | FTL | NA | human | human | 104974-001-002_S1_L001 | 13013.354 |
| gene-TMSB4X | TMSB4X | NA | human | human | 104974-001-002_S1_L001 | 11699.378 |
| gene-ATP6 | ATP6 | NA | human | human | 104974-001-002_S1_L001 | 11343.645 |
| gene-COX2 | COX2 | NA | human | human | 104974-001-002_S1_L001 | 10378.688 |
| gene-ND2 | ND2 | NA | human | human | 104974-001-002_S1_L001 | 9614.317 |
| gene-RPL21 | RPL21 | NA | human | human | 104974-001-002_S1_L001 | 8803.669 |
| gene-RPL41 | RPL41 | NA | human | human | 104974-001-002_S1_L001 | 8432.933 |
| gene-COX1 | COX1 | NA | human | human | 104974-001-002_S1_L001 | 8236.118 |
| gene-RPS12 | RPS12 | NA | human | human | 104974-001-002_S1_L001 | 8169.655 |
| gene-RPLP2 | RPLP2 | NA | human | human | 104974-001-002_S1_L001 | 7483.747 |
| gene-ND1 | ND1 | NA | human | human | 104974-001-002_S1_L001 | 7330.897 |
| gene-EEF1A1 | EEF1A1 | NA | human | human | 104974-001-002_S1_L001 | 7075.555 |
| gene-B2M-2 | B2M | NA | human | human | 104974-001-002_S1_L001 | 7055.202 |
| gene-HBA2 | HBA2 | NA | human | human_globin | 104974-001-002_S1_L001 | 6973.821 |
| gene-RPS20 | RPS20 | NA | human | human | 104974-001-002_S1_L001 | 6841.469 |
| gene-SRGN | SRGN | NA | human | human | 104974-001-002_S1_L001 | 6536.149 |
| gene-ND4 | ND4 | NA | human | human | 104974-001-002_S1_L001 | 5657.471 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-HBA2 | HBA2 | NA | human | human_globin | 104974-001-003_S1_L001 | 46533.376 |
| gene-FTL | FTL | NA | human | human | 104974-001-003_S1_L001 | 29467.269 |
| gene-HBB | HBB | NA | human | human_globin | 104974-001-003_S1_L001 | 27466.991 |
| gene-RPL21 | RPL21 | NA | human | human | 104974-001-003_S1_L001 | 23643.612 |
| gene-UBA52 | UBA52 | NA | human | human | 104974-001-003_S1_L001 | 21219.997 |
| gene-RPS12 | RPS12 | NA | human | human | 104974-001-003_S1_L001 | 20864.756 |
| gene-RNR2 | RNR2 | NA | human | human_rRNA | 104974-001-003_S1_L001 | 19293.910 |
| gene-UBB | UBB | NA | human | human | 104974-001-003_S1_L001 | 17736.970 |
| gene-HBA1 | HBA1 | NA | human | human_globin | 104974-001-003_S1_L001 | 16262.189 |
| gene-B2M | B2M | NA | human | human | 104974-001-003_S1_L001 | 14549.406 |
| gene-GYPC | GYPC | NA | human | human | 104974-001-003_S1_L001 | 12990.299 |
| gene-RPL41 | RPL41 | NA | human | human | 104974-001-003_S1_L001 | 11766.569 |
| gene-OAZ1 | OAZ1 | NA | human | human | 104974-001-003_S1_L001 | 9783.310 |
| gene-FKBP8 | FKBP8 | NA | human | human | 104974-001-003_S1_L001 | 8678.414 |
| gene-S100A8 | S100A8 | NA | human | human | 104974-001-003_S1_L001 | 7978.580 |
| gene-S100A9 | S100A9 | NA | human | human | 104974-001-003_S1_L001 | 7889.953 |
| gene-SLC25A39 | SLC25A39 | NA | human | human | 104974-001-003_S1_L001 | 7762.571 |
| gene-RPS27 | RPS27 | NA | human | human | 104974-001-003_S1_L001 | 7712.087 |
| gene-RPLP2 | RPLP2 | NA | human | human | 104974-001-003_S1_L001 | 7343.200 |
| gene-RNR1 | RNR1 | NA | human | human_rRNA | 104974-001-003_S1_L001 | 6908.576 |
| gene-RPL30 | RPL30 | NA | human | human | 104974-001-003_S1_L001 | 6190.096 |
| gene-COX3 | COX3 | NA | human | human | 104974-001-003_S1_L001 | 6067.999 |
| gene-RPS11 | RPS11 | NA | human | human | 104974-001-003_S1_L001 | 5505.873 |
| gene-ND3 | ND3 | NA | human | human | 104974-001-003_S1_L001 | 5258.114 |
| gene-PKNH_0418600 | gene-PKNH_0418600 | protein_coding | pk | pk | 104974-001-003_S1_L001 | 5022.868 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-HBA2 | HBA2 | NA | human | human_globin | 104974-001-004_S1_L001 | 52755.665 |
| gene-HBB | HBB | NA | human | human_globin | 104974-001-004_S1_L001 | 22097.339 |
| gene-HBA1 | HBA1 | NA | human | human_globin | 104974-001-004_S1_L001 | 21210.681 |
| gene-FTL | FTL | NA | human | human | 104974-001-004_S1_L001 | 19832.883 |
| gene-UBA52 | UBA52 | NA | human | human | 104974-001-004_S1_L001 | 13148.031 |
| gene-RPL21 | RPL21 | NA | human | human | 104974-001-004_S1_L001 | 12112.246 |
| gene-RPS12 | RPS12 | NA | human | human | 104974-001-004_S1_L001 | 11859.816 |
| gene-B2M | B2M | NA | human | human | 104974-001-004_S1_L001 | 10750.176 |
| gene-UBB | UBB | NA | human | human | 104974-001-004_S1_L001 | 9926.561 |
| gene-RNR2 | RNR2 | NA | human | human_rRNA | 104974-001-004_S1_L001 | 9832.230 |
| gene-GYPC | GYPC | NA | human | human | 104974-001-004_S1_L001 | 8986.872 |
| gene-PKNH_0418600 | gene-PKNH_0418600 | protein_coding | pk | pk | 104974-001-004_S1_L001 | 8314.573 |
| gene-S100A8 | S100A8 | NA | human | human | 104974-001-004_S1_L001 | 7066.256 |
| gene-S100A9 | S100A9 | NA | human | human | 104974-001-004_S1_L001 | 7006.419 |
| gene-FKBP8 | FKBP8 | NA | human | human | 104974-001-004_S1_L001 | 6986.322 |
| gene-COX3 | COX3 | NA | human | human | 104974-001-004_S1_L001 | 6966.357 |
| gene-OAZ1 | OAZ1 | NA | human | human | 104974-001-004_S1_L001 | 6785.591 |
| gene-RPL41 | RPL41 | NA | human | human | 104974-001-004_S1_L001 | 6779.247 |
| gene-COX1 | COX1 | NA | human | human | 104974-001-004_S1_L001 | 6315.816 |
| gene-RPS27 | RPS27 | NA | human | human | 104974-001-004_S1_L001 | 6261.843 |
| gene-SLC25A39 | SLC25A39 | NA | human | human | 104974-001-004_S1_L001 | 6113.571 |
| gene-ATP6 | ATP6 | NA | human | human | 104974-001-004_S1_L001 | 6088.562 |
| gene-ND3 | ND3 | NA | human | human | 104974-001-004_S1_L001 | 5573.493 |
| gene-COX2 | COX2 | NA | human | human | 104974-001-004_S1_L001 | 5157.936 |
| gene-RPLP2 | RPLP2 | NA | human | human | 104974-001-004_S1_L001 | 5087.297 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-HBA2 | HBA2 | NA | human | human_globin | 104974-001-005_S1_L001 | 49591.793 |
| gene-FTL | FTL | NA | human | human | 104974-001-005_S1_L001 | 44351.515 |
| gene-UBA52 | UBA52 | NA | human | human | 104974-001-005_S1_L001 | 36767.580 |
| gene-RPL21 | RPL21 | NA | human | human | 104974-001-005_S1_L001 | 35030.120 |
| gene-RPS12 | RPS12 | NA | human | human | 104974-001-005_S1_L001 | 30090.973 |
| gene-HBB | HBB | NA | human | human_globin | 104974-001-005_S1_L001 | 29687.257 |
| gene-UBB | UBB | NA | human | human | 104974-001-005_S1_L001 | 26975.937 |
| gene-RNR2 | RNR2 | NA | human | human_rRNA | 104974-001-005_S1_L001 | 26033.024 |
| gene-GYPC | GYPC | NA | human | human | 104974-001-005_S1_L001 | 20473.960 |
| gene-RPL41 | RPL41 | NA | human | human | 104974-001-005_S1_L001 | 17038.159 |
| gene-HBA1 | HBA1 | NA | human | human_globin | 104974-001-005_S1_L001 | 16668.188 |
| gene-OAZ1 | OAZ1 | NA | human | human | 104974-001-005_S1_L001 | 14476.128 |
| gene-FKBP8 | FKBP8 | NA | human | human | 104974-001-005_S1_L001 | 13027.216 |
| gene-SLC25A39 | SLC25A39 | NA | human | human | 104974-001-005_S1_L001 | 12949.204 |
| gene-RNR1 | RNR1 | NA | human | human_rRNA | 104974-001-005_S1_L001 | 8443.443 |
| gene-RPLP2 | RPLP2 | NA | human | human | 104974-001-005_S1_L001 | 7333.275 |
| gene-RPS11 | RPS11 | NA | human | human | 104974-001-005_S1_L001 | 7058.348 |
| gene-PKNH_0418600 | gene-PKNH_0418600 | protein_coding | pk | pk | 104974-001-005_S1_L001 | 6826.241 |
| gene-RPL30 | RPL30 | NA | human | human | 104974-001-005_S1_L001 | 6393.033 |
| gene-GUK1 | GUK1 | NA | human | human | 104974-001-005_S1_L001 | 6026.648 |
| gene-RPL12 | RPL12 | NA | human | human | 104974-001-005_S1_L001 | 5719.974 |
| gene-ADIPOR1 | ADIPOR1 | NA | human | human | 104974-001-005_S1_L001 | 5600.955 |
| gene-PKNH_1355500 | gene-PKNH_1355500 | protein_coding | pk | pk | 104974-001-005_S1_L001 | 5244.759 |
| gene-RPS15 | RPS15 | NA | human | human | 104974-001-005_S1_L001 | 4708.231 |
| gene-RPS24 | RPS24 | NA | human | human | 104974-001-005_S1_L001 | 4399.440 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-B2M | B2M | NA | human | human | 104974-001-006_S1_L001 | 28553.397 |
| gene-COX3 | COX3 | NA | human | human | 104974-001-006_S1_L001 | 19236.293 |
| gene-S100A9 | S100A9 | NA | human | human | 104974-001-006_S1_L001 | 18303.677 |
| gene-TMSB4X | TMSB4X | NA | human | human | 104974-001-006_S1_L001 | 18124.259 |
| gene-S100A8 | S100A8 | NA | human | human | 104974-001-006_S1_L001 | 16150.164 |
| gene-B2M-2 | B2M | NA | human | human | 104974-001-006_S1_L001 | 15785.354 |
| gene-ND3 | ND3 | NA | human | human | 104974-001-006_S1_L001 | 15074.849 |
| gene-COX1 | COX1 | NA | human | human | 104974-001-006_S1_L001 | 14709.890 |
| gene-COX2 | COX2 | NA | human | human | 104974-001-006_S1_L001 | 14454.597 |
| gene-ND2 | ND2 | NA | human | human | 104974-001-006_S1_L001 | 14204.682 |
| gene-FTL | FTL | NA | human | human | 104974-001-006_S1_L001 | 11557.118 |
| gene-ND1 | ND1 | NA | human | human | 104974-001-006_S1_L001 | 10850.009 |
| gene-ATP6 | ATP6 | NA | human | human | 104974-001-006_S1_L001 | 10710.461 |
| gene-RPS27 | RPS27 | NA | human | human | 104974-001-006_S1_L001 | 9942.583 |
| gene-ND4 | ND4 | NA | human | human | 104974-001-006_S1_L001 | 9064.038 |
| gene-ATP8 | ATP8 | NA | human | human | 104974-001-006_S1_L001 | 8615.542 |
| gene-EEF1A1 | EEF1A1 | NA | human | human | 104974-001-006_S1_L001 | 8459.874 |
| gene-RPS20 | RPS20 | NA | human | human | 104974-001-006_S1_L001 | 7580.700 |
| gene-CYTB | CYTB | NA | human | human | 104974-001-006_S1_L001 | 7165.345 |
| gene-RPL41 | RPL41 | NA | human | human | 104974-001-006_S1_L001 | 7005.028 |
| gene-RPL19 | RPL19 | NA | human | human | 104974-001-006_S1_L001 | 6904.254 |
| gene-RPL21 | RPL21 | NA | human | human | 104974-001-006_S1_L001 | 6793.811 |
| gene-RPL34 | RPL34 | NA | human | human | 104974-001-006_S1_L001 | 5924.000 |
| gene-RPL9 | RPL9 | NA | human | human | 104974-001-006_S1_L001 | 5814.001 |
| gene-FTH1 | FTH1 | NA | human | human | 104974-001-006_S1_L001 | 5571.215 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-RN7SL2 | RN7SL2 | NA | human | human | 104974-001-007_S1_L001 | 131068.139 |
| gene-RN7SL1 | RN7SL1 | NA | human | human | 104974-001-007_S1_L001 | 92664.227 |
| gene-PKNH_0321100 | gene-PKNH_0321100 | rRNA | pk | pk_rRNA | 104974-001-007_S1_L001 | 73099.379 |
| gene-RN7SK | RN7SK | NA | human | human | 104974-001-007_S1_L001 | 70738.871 |
| gene-PKNH_1001000 | gene-PKNH_1001000 | rRNA | pk | pk_rRNA | 104974-001-007_S1_L001 | 58787.433 |
| gene-PKNH_0320900 | gene-PKNH_0320900 | rRNA | pk | pk_rRNA | 104974-001-007_S1_L001 | 27365.645 |
| gene-PKNH_1001200 | gene-PKNH_1001200 | rRNA | pk | pk_rRNA | 104974-001-007_S1_L001 | 25716.594 |
| gene-PKNH_0321000 | gene-PKNH_0321000 | rRNA | pk | pk_rRNA | 104974-001-007_S1_L001 | 17270.634 |
| gene-RMRP | RMRP | NA | human | human | 104974-001-007_S1_L001 | 14366.099 |
| gene-B2M | B2M | NA | human | human | 104974-001-007_S1_L001 | 12165.333 |
| gene-PKNH_1339200 | gene-PKNH_1339200 | ncRNA | pk | pk | 104974-001-007_S1_L001 | 8712.438 |
| gene-TMSB4X | TMSB4X | NA | human | human | 104974-001-007_S1_L001 | 7823.012 |
| gene-COX3 | COX3 | NA | human | human | 104974-001-007_S1_L001 | 7685.253 |
| gene-PKNH_1001100 | gene-PKNH_1001100 | rRNA | pk | pk_rRNA | 104974-001-007_S1_L001 | 7579.708 |
| gene-ND2 | ND2 | NA | human | human | 104974-001-007_S1_L001 | 6852.100 |
| gene-COX1 | COX1 | NA | human | human | 104974-001-007_S1_L001 | 6031.814 |
| gene-COX2 | COX2 | NA | human | human | 104974-001-007_S1_L001 | 5463.136 |
| gene-ATP6 | ATP6 | NA | human | human | 104974-001-007_S1_L001 | 5451.582 |
| gene-S100A8 | S100A8 | NA | human | human | 104974-001-007_S1_L001 | 5309.715 |
| gene-ND1 | ND1 | NA | human | human | 104974-001-007_S1_L001 | 5106.975 |
| gene-ND4 | ND4 | NA | human | human | 104974-001-007_S1_L001 | 4979.826 |
| gene-PKNH_0935200 | gene-PKNH_0935200 | snRNA | pk | pk | 104974-001-007_S1_L001 | 4468.936 |
| gene-ND3 | ND3 | NA | human | human | 104974-001-007_S1_L001 | 4455.028 |
| gene-S100A9 | S100A9 | NA | human | human | 104974-001-007_S1_L001 | 4377.385 |
| gene-RPPH1 | RPPH1 | NA | human | human | 104974-001-007_S1_L001 | 4178.458 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-RN7SL2 | RN7SL2 | NA | human | human | 104974-001-008_S1_L001 | 139279.456 |
| gene-RN7SL1 | RN7SL1 | NA | human | human | 104974-001-008_S1_L001 | 88237.728 |
| gene-PKNH_1001000 | gene-PKNH_1001000 | rRNA | pk | pk_rRNA | 104974-001-008_S1_L001 | 78313.591 |
| gene-PKNH_0321100 | gene-PKNH_0321100 | rRNA | pk | pk_rRNA | 104974-001-008_S1_L001 | 76693.896 |
| gene-RN7SK | RN7SK | NA | human | human | 104974-001-008_S1_L001 | 76096.199 |
| gene-PKNH_0320900 | gene-PKNH_0320900 | rRNA | pk | pk_rRNA | 104974-001-008_S1_L001 | 30296.682 |
| gene-PKNH_1001200 | gene-PKNH_1001200 | rRNA | pk | pk_rRNA | 104974-001-008_S1_L001 | 27906.598 |
| gene-PKNH_0321000 | gene-PKNH_0321000 | rRNA | pk | pk_rRNA | 104974-001-008_S1_L001 | 23171.875 |
| gene-RMRP | RMRP | NA | human | human | 104974-001-008_S1_L001 | 16032.224 |
| gene-PKNH_1001100 | gene-PKNH_1001100 | rRNA | pk | pk_rRNA | 104974-001-008_S1_L001 | 14289.531 |
| gene-B2M | B2M | NA | human | human | 104974-001-008_S1_L001 | 12222.046 |
| gene-PKNH_1339200 | gene-PKNH_1339200 | ncRNA | pk | pk | 104974-001-008_S1_L001 | 9740.299 |
| gene-PKNH_0935200 | gene-PKNH_0935200 | snRNA | pk | pk | 104974-001-008_S1_L001 | 7697.558 |
| gene-COX3 | COX3 | NA | human | human | 104974-001-008_S1_L001 | 7411.967 |
| gene-TMSB4X | TMSB4X | NA | human | human | 104974-001-008_S1_L001 | 6867.503 |
| gene-ND2 | ND2 | NA | human | human | 104974-001-008_S1_L001 | 5897.639 |
| gene-COX1 | COX1 | NA | human | human | 104974-001-008_S1_L001 | 5487.208 |
| gene-S100A8 | S100A8 | NA | human | human | 104974-001-008_S1_L001 | 5483.937 |
| gene-COX2 | COX2 | NA | human | human | 104974-001-008_S1_L001 | 5235.476 |
| gene-RPPH1 | RPPH1 | NA | human | human | 104974-001-008_S1_L001 | 5041.748 |
| gene-RPPH1-2 | RPPH1 | NA | human | human | 104974-001-008_S1_L001 | 4996.489 |
| gene-ATP6 | ATP6 | NA | human | human | 104974-001-008_S1_L001 | 4855.997 |
| gene-S100A9 | S100A9 | NA | human | human | 104974-001-008_S1_L001 | 4804.481 |
| gene-ND1 | ND1 | NA | human | human | 104974-001-008_S1_L001 | 4769.538 |
| gene-ND4 | ND4 | NA | human | human | 104974-001-008_S1_L001 | 4491.039 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-RN7SL2 | RN7SL2 | NA | human | human | 104974-001-009_S1_L001 | 145681.116 |
| gene-PKNH_0321100 | gene-PKNH_0321100 | rRNA | pk | pk_rRNA | 104974-001-009_S1_L001 | 144323.085 |
| gene-PKNH_1001000 | gene-PKNH_1001000 | rRNA | pk | pk_rRNA | 104974-001-009_S1_L001 | 113693.416 |
| gene-RN7SL1 | RN7SL1 | NA | human | human | 104974-001-009_S1_L001 | 97956.989 |
| gene-PKNH_0320900 | gene-PKNH_0320900 | rRNA | pk | pk_rRNA | 104974-001-009_S1_L001 | 58482.134 |
| gene-PKNH_0321000 | gene-PKNH_0321000 | rRNA | pk | pk_rRNA | 104974-001-009_S1_L001 | 54071.123 |
| gene-PKNH_1001200 | gene-PKNH_1001200 | rRNA | pk | pk_rRNA | 104974-001-009_S1_L001 | 53881.108 |
| gene-PKNH_1001100 | gene-PKNH_1001100 | rRNA | pk | pk_rRNA | 104974-001-009_S1_L001 | 24293.746 |
| gene-RN7SK | RN7SK | NA | human | human | 104974-001-009_S1_L001 | 20907.443 |
| gene-PKNH_0935200 | gene-PKNH_0935200 | snRNA | pk | pk | 104974-001-009_S1_L001 | 18472.293 |
| gene-PKNH_1339200 | gene-PKNH_1339200 | ncRNA | pk | pk | 104974-001-009_S1_L001 | 12768.772 |
| gene-HBB | HBB | NA | human | human_globin | 104974-001-009_S1_L001 | 6401.856 |
| gene-FTL | FTL | NA | human | human | 104974-001-009_S1_L001 | 5156.650 |
| gene-UBB | UBB | NA | human | human | 104974-001-009_S1_L001 | 3804.390 |
| gene-ADIPOR1 | ADIPOR1 | NA | human | human | 104974-001-009_S1_L001 | 3777.510 |
| gene-RPPH1-2 | RPPH1 | NA | human | human | 104974-001-009_S1_L001 | 3718.740 |
| gene-RPPH1 | RPPH1 | NA | human | human | 104974-001-009_S1_L001 | 3692.364 |
| gene-RNY1 | RNY1 | NA | human | human | 104974-001-009_S1_L001 | 3687.490 |
| gene-RNA28SN4 | RNA28SN4 | NA | human | human_rRNA | 104974-001-009_S1_L001 | 3199.299 |
| gene-GYPC | GYPC | NA | human | human | 104974-001-009_S1_L001 | 3180.375 |
| gene-RPL21 | RPL21 | NA | human | human | 104974-001-009_S1_L001 | 2826.477 |
| gene-HBA2 | HBA2 | NA | human | human_globin | 104974-001-009_S1_L001 | 2678.665 |
| gene-RMRP | RMRP | NA | human | human | 104974-001-009_S1_L001 | 2494.538 |
| gene-UBA52 | UBA52 | NA | human | human | 104974-001-009_S1_L001 | 2312.951 |
| gene-PKNH_0418600 | gene-PKNH_0418600 | protein_coding | pk | pk | 104974-001-009_S1_L001 | 2094.309 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-PKNH_0321100 | gene-PKNH_0321100 | rRNA | pk | pk_rRNA | 104974-001-010_S1_L001 | 181501.107 |
| gene-PKNH_1001000 | gene-PKNH_1001000 | rRNA | pk | pk_rRNA | 104974-001-010_S1_L001 | 143352.902 |
| gene-RN7SL2 | RN7SL2 | NA | human | human | 104974-001-010_S1_L001 | 86488.492 |
| gene-PKNH_0320900 | gene-PKNH_0320900 | rRNA | pk | pk_rRNA | 104974-001-010_S1_L001 | 70146.903 |
| gene-PKNH_1001200 | gene-PKNH_1001200 | rRNA | pk | pk_rRNA | 104974-001-010_S1_L001 | 66180.869 |
| gene-RN7SL1 | RN7SL1 | NA | human | human | 104974-001-010_S1_L001 | 60353.922 |
| gene-PKNH_0321000 | gene-PKNH_0321000 | rRNA | pk | pk_rRNA | 104974-001-010_S1_L001 | 54735.717 |
| gene-PKNH_1001100 | gene-PKNH_1001100 | rRNA | pk | pk_rRNA | 104974-001-010_S1_L001 | 20313.046 |
| gene-PKNH_0935200 | gene-PKNH_0935200 | snRNA | pk | pk | 104974-001-010_S1_L001 | 16268.334 |
| gene-RN7SK | RN7SK | NA | human | human | 104974-001-010_S1_L001 | 13867.096 |
| gene-PKNH_1339200 | gene-PKNH_1339200 | ncRNA | pk | pk | 104974-001-010_S1_L001 | 9108.890 |
| gene-RNA28SN4 | RNA28SN4 | NA | human | human_rRNA | 104974-001-010_S1_L001 | 3993.278 |
| gene-HBA2 | HBA2 | NA | human | human_globin | 104974-001-010_S1_L001 | 3889.037 |
| gene-HBB | HBB | NA | human | human_globin | 104974-001-010_S1_L001 | 3806.526 |
| gene-ADIPOR1 | ADIPOR1 | NA | human | human | 104974-001-010_S1_L001 | 3607.743 |
| gene-FTL | FTL | NA | human | human | 104974-001-010_S1_L001 | 3459.588 |
| gene-HBA1 | HBA1 | NA | human | human_globin | 104974-001-010_S1_L001 | 3183.081 |
| gene-GYPC | GYPC | NA | human | human | 104974-001-010_S1_L001 | 2799.467 |
| gene-RPPH1 | RPPH1 | NA | human | human | 104974-001-010_S1_L001 | 2659.526 |
| gene-PKNH_0418600 | gene-PKNH_0418600 | protein_coding | pk | pk | 104974-001-010_S1_L001 | 2645.063 |
| gene-RPPH1-2 | RPPH1 | NA | human | human | 104974-001-010_S1_L001 | 2627.532 |
| gene-PKNH_1132600 | gene-PKNH_1132600 | protein_coding | pk | pk | 104974-001-010_S1_L001 | 2592.234 |
| gene-COX1 | COX1 | NA | human | human | 104974-001-010_S1_L001 | 2414.909 |
| gene-UBB | UBB | NA | human | human | 104974-001-010_S1_L001 | 2265.004 |
| gene-RMRP | RMRP | NA | human | human | 104974-001-010_S1_L001 | 2211.677 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-RN7SL2 | RN7SL2 | NA | human | human | 104974-001-011_S1_L001 | 161042.174 |
| gene-PKNH_0321100 | gene-PKNH_0321100 | rRNA | pk | pk_rRNA | 104974-001-011_S1_L001 | 147935.000 |
| gene-PKNH_1001000 | gene-PKNH_1001000 | rRNA | pk | pk_rRNA | 104974-001-011_S1_L001 | 115582.956 |
| gene-RN7SL1 | RN7SL1 | NA | human | human | 104974-001-011_S1_L001 | 108220.532 |
| gene-PKNH_0320900 | gene-PKNH_0320900 | rRNA | pk | pk_rRNA | 104974-001-011_S1_L001 | 61256.956 |
| gene-PKNH_1001200 | gene-PKNH_1001200 | rRNA | pk | pk_rRNA | 104974-001-011_S1_L001 | 55243.977 |
| gene-PKNH_0321000 | gene-PKNH_0321000 | rRNA | pk | pk_rRNA | 104974-001-011_S1_L001 | 42015.846 |
| gene-PKNH_1001100 | gene-PKNH_1001100 | rRNA | pk | pk_rRNA | 104974-001-011_S1_L001 | 18712.764 |
| gene-RN7SK | RN7SK | NA | human | human | 104974-001-011_S1_L001 | 10855.497 |
| gene-PKNH_0935200 | gene-PKNH_0935200 | snRNA | pk | pk | 104974-001-011_S1_L001 | 10217.904 |
| gene-PKNH_1339200 | gene-PKNH_1339200 | ncRNA | pk | pk | 104974-001-011_S1_L001 | 7907.086 |
| gene-FTL | FTL | NA | human | human | 104974-001-011_S1_L001 | 6860.040 |
| gene-HBB | HBB | NA | human | human_globin | 104974-001-011_S1_L001 | 5702.090 |
| gene-ADIPOR1 | ADIPOR1 | NA | human | human | 104974-001-011_S1_L001 | 5452.125 |
| gene-UBB | UBB | NA | human | human | 104974-001-011_S1_L001 | 5000.284 |
| gene-GYPC | GYPC | NA | human | human | 104974-001-011_S1_L001 | 4800.788 |
| gene-RNA28SN4 | RNA28SN4 | NA | human | human_rRNA | 104974-001-011_S1_L001 | 4511.897 |
| gene-RPL21 | RPL21 | NA | human | human | 104974-001-011_S1_L001 | 4355.019 |
| gene-RPPH1-2 | RPPH1 | NA | human | human | 104974-001-011_S1_L001 | 3979.430 |
| gene-RPPH1 | RPPH1 | NA | human | human | 104974-001-011_S1_L001 | 3978.027 |
| gene-SLC25A39 | SLC25A39 | NA | human | human | 104974-001-011_S1_L001 | 3085.912 |
| gene-RNY1 | RNY1 | NA | human | human | 104974-001-011_S1_L001 | 2933.827 |
| gene-UBA52 | UBA52 | NA | human | human | 104974-001-011_S1_L001 | 2915.853 |
| gene-OAZ1 | OAZ1 | NA | human | human | 104974-001-011_S1_L001 | 2894.726 |
| gene-HBA2 | HBA2 | NA | human | human_globin | 104974-001-011_S1_L001 | 2852.671 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-HBB | HBB | NA | human | human_globin | 104974-001-012_S1_L001 | 366480.180 |
| gene-HBA2 | HBA2 | NA | human | human_globin | 104974-001-012_S1_L001 | 263245.083 |
| gene-HBA1 | HBA1 | NA | human | human_globin | 104974-001-012_S1_L001 | 98009.922 |
| gene-B2M | B2M | NA | human | human | 104974-001-012_S1_L001 | 6733.582 |
| gene-RNR2 | RNR2 | NA | human | human_rRNA | 104974-001-012_S1_L001 | 5784.927 |
| gene-S100A9 | S100A9 | NA | human | human | 104974-001-012_S1_L001 | 5749.014 |
| gene-S100A8 | S100A8 | NA | human | human | 104974-001-012_S1_L001 | 4593.144 |
| gene-RNA28SN4 | RNA28SN4 | NA | human | human_rRNA | 104974-001-012_S1_L001 | 4190.916 |
| gene-LOC124905315 | LOC124905315 | NA | human | human | 104974-001-012_S1_L001 | 3825.278 |
| gene-RNR1 | RNR1 | NA | human | human_rRNA | 104974-001-012_S1_L001 | 3549.196 |
| gene-TMSB4X | TMSB4X | NA | human | human | 104974-001-012_S1_L001 | 3347.079 |
| gene-RPS27 | RPS27 | NA | human | human | 104974-001-012_S1_L001 | 3045.938 |
| gene-FTL | FTL | NA | human | human | 104974-001-012_S1_L001 | 3016.675 |
| gene-ND3 | ND3 | NA | human | human | 104974-001-012_S1_L001 | 2750.724 |
| gene-COX3 | COX3 | NA | human | human | 104974-001-012_S1_L001 | 2577.031 |
| gene-RPL41 | RPL41 | NA | human | human | 104974-001-012_S1_L001 | 2375.402 |
| gene-RPS12 | RPS12 | NA | human | human | 104974-001-012_S1_L001 | 2212.215 |
| gene-COX2 | COX2 | NA | human | human | 104974-001-012_S1_L001 | 2207.033 |
| gene-RPL39 | RPL39 | NA | human | human | 104974-001-012_S1_L001 | 2132.982 |
| gene-ATP8 | ATP8 | NA | human | human | 104974-001-012_S1_L001 | 1919.579 |
| gene-EEF1A1 | EEF1A1 | NA | human | human | 104974-001-012_S1_L001 | 1903.976 |
| gene-RNA28SN2 | RNA28SN2 | NA | human | human_rRNA | 104974-001-012_S1_L001 | 1875.808 |
| gene-ATP6 | ATP6 | NA | human | human | 104974-001-012_S1_L001 | 1867.009 |
| gene-RNA18SN2 | RNA18SN2 | NA | human | human_rRNA | 104974-001-012_S1_L001 | 1677.114 |
| gene-RNA18SN3 | RNA18SN3 | NA | human | human_rRNA | 104974-001-012_S1_L001 | 1671.307 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-HBB | HBB | NA | human | human_globin | 104974-001-013_S1_L001 | 388222.402 |
| gene-HBA2 | HBA2 | NA | human | human_globin | 104974-001-013_S1_L001 | 289856.854 |
| gene-HBA1 | HBA1 | NA | human | human_globin | 104974-001-013_S1_L001 | 103444.535 |
| gene-S100A9 | S100A9 | NA | human | human | 104974-001-013_S1_L001 | 6019.643 |
| gene-B2M | B2M | NA | human | human | 104974-001-013_S1_L001 | 5886.942 |
| gene-RNR2 | RNR2 | NA | human | human_rRNA | 104974-001-013_S1_L001 | 5240.822 |
| gene-S100A8 | S100A8 | NA | human | human | 104974-001-013_S1_L001 | 4629.461 |
| gene-RNR1 | RNR1 | NA | human | human_rRNA | 104974-001-013_S1_L001 | 3265.530 |
| gene-FTL | FTL | NA | human | human | 104974-001-013_S1_L001 | 3136.281 |
| gene-TMSB4X | TMSB4X | NA | human | human | 104974-001-013_S1_L001 | 2718.495 |
| gene-RNA28SN4 | RNA28SN4 | NA | human | human_rRNA | 104974-001-013_S1_L001 | 2457.827 |
| gene-COX3 | COX3 | NA | human | human | 104974-001-013_S1_L001 | 2138.235 |
| gene-ND3 | ND3 | NA | human | human | 104974-001-013_S1_L001 | 2125.318 |
| gene-RPS27 | RPS27 | NA | human | human | 104974-001-013_S1_L001 | 2021.781 |
| gene-RPS12 | RPS12 | NA | human | human | 104974-001-013_S1_L001 | 1910.314 |
| gene-RPL41 | RPL41 | NA | human | human | 104974-001-013_S1_L001 | 1851.973 |
| gene-LOC124905315 | LOC124905315 | NA | human | human | 104974-001-013_S1_L001 | 1741.411 |
| gene-COX2 | COX2 | NA | human | human | 104974-001-013_S1_L001 | 1678.269 |
| gene-FTH1 | FTH1 | NA | human | human | 104974-001-013_S1_L001 | 1635.636 |
| gene-RPL39 | RPL39 | NA | human | human | 104974-001-013_S1_L001 | 1558.862 |
| gene-ATP8 | ATP8 | NA | human | human | 104974-001-013_S1_L001 | 1532.902 |
| gene-EEF1A1 | EEF1A1 | NA | human | human | 104974-001-013_S1_L001 | 1446.312 |
| gene-ATP6 | ATP6 | NA | human | human | 104974-001-013_S1_L001 | 1419.527 |
| gene-RPL21 | RPL21 | NA | human | human | 104974-001-013_S1_L001 | 1331.099 |
| gene-COX1 | COX1 | NA | human | human | 104974-001-013_S1_L001 | 1240.701 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-HBB | HBB | NA | human | human_globin | 104974-001-014_S1_L001 | 523264.4084 |
| gene-HBA2 | HBA2 | NA | human | human_globin | 104974-001-014_S1_L001 | 284480.8538 |
| gene-HBA1 | HBA1 | NA | human | human_globin | 104974-001-014_S1_L001 | 114371.1571 |
| gene-LOC124905315 | LOC124905315 | NA | human | human | 104974-001-014_S1_L001 | 2918.1146 |
| gene-RNA28SN4 | RNA28SN4 | NA | human | human_rRNA | 104974-001-014_S1_L001 | 2536.0602 |
| gene-FTL | FTL | NA | human | human | 104974-001-014_S1_L001 | 1871.6873 |
| gene-RNA18SN3-2 | RNA18SN3 | NA | human | human_rRNA | 104974-001-014_S1_L001 | 1543.4707 |
| gene-RNA18SN4 | RNA18SN4 | NA | human | human_rRNA | 104974-001-014_S1_L001 | 1542.5888 |
| gene-RNA18SN3 | RNA18SN3 | NA | human | human_rRNA | 104974-001-014_S1_L001 | 1542.4997 |
| gene-RNA18SN2 | RNA18SN2 | NA | human | human_rRNA | 104974-001-014_S1_L001 | 1535.9067 |
| gene-RPS12 | RPS12 | NA | human | human | 104974-001-014_S1_L001 | 1366.6827 |
| gene-RNA28SN2 | RNA28SN2 | NA | human | human_rRNA | 104974-001-014_S1_L001 | 1296.8516 |
| gene-RNR2 | RNR2 | NA | human | human_rRNA | 104974-001-014_S1_L001 | 1166.9362 |
| gene-UBA52 | UBA52 | NA | human | human | 104974-001-014_S1_L001 | 1068.1666 |
| gene-RPL21 | RPL21 | NA | human | human | 104974-001-014_S1_L001 | 1028.5515 |
| gene-UBB | UBB | NA | human | human | 104974-001-014_S1_L001 | 892.5544 |
| gene-GYPC | GYPC | NA | human | human | 104974-001-014_S1_L001 | 851.2235 |
| gene-HBG2 | HBG2 | NA | human | human_globin | 104974-001-014_S1_L001 | 842.1872 |
| gene-FKBP8 | FKBP8 | NA | human | human | 104974-001-014_S1_L001 | 814.0800 |
| gene-B2M | B2M | NA | human | human | 104974-001-014_S1_L001 | 783.0767 |
| gene-RPL41 | RPL41 | NA | human | human | 104974-001-014_S1_L001 | 715.1521 |
| gene-RNA5-8SN4 | RNA5-8SN4 | NA | human | human_rRNA | 104974-001-014_S1_L001 | 636.0227 |
| gene-OAZ1 | OAZ1 | NA | human | human | 104974-001-014_S1_L001 | 599.3413 |
| gene-S100A9 | S100A9 | NA | human | human | 104974-001-014_S1_L001 | 578.5453 |
| gene-SLC25A39 | SLC25A39 | NA | human | human | 104974-001-014_S1_L001 | 555.1666 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-HBB | HBB | NA | human | human_globin | 104974-001-015_S1_L001 | 448423.0722 |
| gene-HBA2 | HBA2 | NA | human | human_globin | 104974-001-015_S1_L001 | 316409.7173 |
| gene-HBA1 | HBA1 | NA | human | human_globin | 104974-001-015_S1_L001 | 127791.9575 |
| gene-LOC124905315 | LOC124905315 | NA | human | human | 104974-001-015_S1_L001 | 3596.4548 |
| gene-RNA28SN4 | RNA28SN4 | NA | human | human_rRNA | 104974-001-015_S1_L001 | 3411.2697 |
| gene-RNA18SN2 | RNA18SN2 | NA | human | human_rRNA | 104974-001-015_S1_L001 | 1762.2669 |
| gene-RNA18SN4 | RNA18SN4 | NA | human | human_rRNA | 104974-001-015_S1_L001 | 1756.8958 |
| gene-RNA18SN3 | RNA18SN3 | NA | human | human_rRNA | 104974-001-015_S1_L001 | 1753.9691 |
| gene-RNA18SN3-2 | RNA18SN3 | NA | human | human_rRNA | 104974-001-015_S1_L001 | 1750.1382 |
| gene-FTL | FTL | NA | human | human | 104974-001-015_S1_L001 | 1660.3870 |
| gene-RNA28SN2 | RNA28SN2 | NA | human | human_rRNA | 104974-001-015_S1_L001 | 1615.4719 |
| gene-RPS12 | RPS12 | NA | human | human | 104974-001-015_S1_L001 | 1099.8989 |
| gene-UBA52 | UBA52 | NA | human | human | 104974-001-015_S1_L001 | 1020.6326 |
| gene-PKNH_0418600 | gene-PKNH_0418600 | protein_coding | pk | pk | 104974-001-015_S1_L001 | 925.7916 |
| gene-FKBP8 | FKBP8 | NA | human | human | 104974-001-015_S1_L001 | 917.5296 |
| gene-GYPC | GYPC | NA | human | human | 104974-001-015_S1_L001 | 858.6056 |
| gene-B2M | B2M | NA | human | human | 104974-001-015_S1_L001 | 792.5917 |
| gene-RPL21 | RPL21 | NA | human | human | 104974-001-015_S1_L001 | 779.4010 |
| gene-RNR2 | RNR2 | NA | human | human_rRNA | 104974-001-015_S1_L001 | 768.8011 |
| gene-UBB | UBB | NA | human | human | 104974-001-015_S1_L001 | 734.8697 |
| gene-S100A9 | S100A9 | NA | human | human | 104974-001-015_S1_L001 | 692.6973 |
| gene-HBG2 | HBG2 | NA | human | human_globin | 104974-001-015_S1_L001 | 663.0136 |
| gene-OAZ1 | OAZ1 | NA | human | human | 104974-001-015_S1_L001 | 576.5540 |
| gene-SLC25A39 | SLC25A39 | NA | human | human | 104974-001-015_S1_L001 | 567.3799 |
| gene-RPL41 | RPL41 | NA | human | human | 104974-001-015_S1_L001 | 566.2701 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-HBB | HBB | NA | human | human_globin | 104974-001-016_S1_L001 | 514171.8546 |
| gene-HBA2 | HBA2 | NA | human | human_globin | 104974-001-016_S1_L001 | 301490.6876 |
| gene-HBA1 | HBA1 | NA | human | human_globin | 104974-001-016_S1_L001 | 122126.6477 |
| gene-LOC124905315 | LOC124905315 | NA | human | human | 104974-001-016_S1_L001 | 3512.6965 |
| gene-RNA28SN4 | RNA28SN4 | NA | human | human_rRNA | 104974-001-016_S1_L001 | 3033.8382 |
| gene-RNA18SN2 | RNA18SN2 | NA | human | human_rRNA | 104974-001-016_S1_L001 | 2096.6635 |
| gene-RNA18SN3 | RNA18SN3 | NA | human | human_rRNA | 104974-001-016_S1_L001 | 2086.6004 |
| gene-RNA18SN4 | RNA18SN4 | NA | human | human_rRNA | 104974-001-016_S1_L001 | 2073.9575 |
| gene-RNA18SN3-2 | RNA18SN3 | NA | human | human_rRNA | 104974-001-016_S1_L001 | 2066.1062 |
| gene-FTL | FTL | NA | human | human | 104974-001-016_S1_L001 | 1780.1712 |
| gene-RNA28SN2 | RNA28SN2 | NA | human | human_rRNA | 104974-001-016_S1_L001 | 1534.3552 |
| gene-RPS12 | RPS12 | NA | human | human | 104974-001-016_S1_L001 | 1288.8221 |
| gene-UBA52 | UBA52 | NA | human | human | 104974-001-016_S1_L001 | 1087.6524 |
| gene-RPL21 | RPL21 | NA | human | human | 104974-001-016_S1_L001 | 990.7414 |
| gene-RNR2 | RNR2 | NA | human | human_rRNA | 104974-001-016_S1_L001 | 990.5111 |
| gene-UBB | UBB | NA | human | human | 104974-001-016_S1_L001 | 922.4129 |
| gene-HBG2 | HBG2 | NA | human | human_globin | 104974-001-016_S1_L001 | 911.4972 |
| gene-GYPC | GYPC | NA | human | human | 104974-001-016_S1_L001 | 883.1664 |
| gene-FKBP8 | FKBP8 | NA | human | human | 104974-001-016_S1_L001 | 818.6246 |
| gene-OAZ1 | OAZ1 | NA | human | human | 104974-001-016_S1_L001 | 600.4898 |
| gene-RPL41 | RPL41 | NA | human | human | 104974-001-016_S1_L001 | 559.7779 |
| gene-SLC25A39 | SLC25A39 | NA | human | human | 104974-001-016_S1_L001 | 555.5004 |
| gene-RNA5-8SN4 | RNA5-8SN4 | NA | human | human_rRNA | 104974-001-016_S1_L001 | 525.1435 |
| gene-RN7SL2 | RN7SL2 | NA | human | human | 104974-001-016_S1_L001 | 410.6589 |
| gene-PKNH_0418600 | gene-PKNH_0418600 | protein_coding | pk | pk | 104974-001-016_S1_L001 | 380.8039 |
Also see figure in results-refseq/fastq-screen/ and results-refseq/multiqc-fastq-screen-samtools-pk/multiqc_report.html.
fastq_screen_transformed <- bind_rows(
fastq_screen %>%
mutate(`%_single_genome` = `%_One_hit_one_genome` + `%_Multiple_hits_one_genome`) %>%
mutate(`%_multiple_genomes` = `%_One_hit_multiple_genomes` + `%_Multiple_hits_multiple_genomes`) %>%
mutate(`No_hits` = `%_One_hit_multiple_genomes` + `%_Multiple_hits_multiple_genomes`) %>%
select(c(sample, Genome, `#Reads_processed`, `%_single_genome`, `%_multiple_genomes`)) %>%
pivot_wider(names_from = Genome, values_from = c(`%_single_genome`, `%_multiple_genomes`)) %>%
select(!`%_multiple_genomes_PknowlesiH`) %>%
rename(`%_multiple_genomes` = `%_multiple_genomes_Human`) %>%
mutate(pct = 1 - `%_single_genome_PknowlesiH` - `%_single_genome_Human` - `%_multiple_genomes`) %>%
select(sample, pct) %>%
mutate("Mapped reads" = "No hits"),
fastq_screen %>%
mutate(pct = `%_One_hit_one_genome` + `%_Multiple_hits_one_genome`) %>%
select(c(sample, Genome, pct)) %>%
rename("Mapped reads" = Genome),
fastq_screen %>%
mutate(pct = `%_One_hit_multiple_genomes` + `%_Multiple_hits_multiple_genomes`) %>%
select(c(sample, pct)) %>%
distinct(sample, .keep_all = TRUE) %>%
mutate("Mapped reads" = "Multiple genomes")
) %>%
# change order of levels here to adjust colours
mutate(`Mapped reads` = factor(`Mapped reads`,
levels = c("PknowlesiH", "Human", "Multiple genomes", "No hits"),
labels = c("P. knowlesi", "Human", "Multiple hits", "No hits")
)) %>%
left_join(samples, by = "sample")
ggplot(fastq_screen_transformed, aes(x = sample_short, y = pct, fill = `Mapped reads`)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(option = "viridis", discrete = T, limits = rev) +
coord_flip() +
scale_x_discrete(limits = rev) +
labs(fill = expression("FastQ Screen mapped reads"), x = "Sample") +
ylab(label = "% of mapped reads")
# Human reads
fastq_screen %>%
pivot_longer(
cols = starts_with("%"),
names_to = "fastq_screen_pct_type",
values_to = "fastq_screen_pct_value"
) %>%
filter(Genome == "Human") %>%
left_join(samples) %>%
ggplot(aes(x = sample_short, y = fastq_screen_pct_value, fill = fastq_screen_pct_type)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(option = "viridis", discrete = T) +
coord_flip() +
scale_x_discrete(limits = rev) +
labs(fill = "FastQ screen statistics for human reads", x = "Sample", y = "% of mapped reads")
## Joining with `by = join_by(sample)`
# Pk reads
fastq_screen %>%
pivot_longer(
cols = starts_with("%"),
names_to = "fastq_screen_pct_type",
values_to = "fastq_screen_pct_value"
) %>%
filter(Genome == "PknowlesiH") %>%
left_join(samples) %>%
ggplot(aes(x = sample_short, y = fastq_screen_pct_value, fill = fastq_screen_pct_type)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(option = "viridis", discrete = T) +
coord_flip() +
scale_x_discrete(limits = rev) +
labs(fill = expression("FastQ screen statistics for" ~ italic("P. knowlesi") ~ "reads"), x = "Sample", y = "% of mapped reads")
## Joining with `by = join_by(sample)`
gene_annotation %>%
# select rRNA and globins
filter(!gene_type %in% c("human", "pk")) %>%
left_join(gene_tpm_filtered[1:18], by = c("gene_id", "gene_name")) %>%
filter(rowSums(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`)) != 0) %>%
pivot_longer(
cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`,
names_to = "sample",
values_to = "counts"
) %>%
left_join(samples, by = "sample") %>%
group_by(gene_id) %>%
mutate(total = sum(counts)) %>%
ungroup() %>%
arrange(species, gene_type, desc(total)) %>%
mutate(gene_name = factor(gene_name, levels = unique(gene_name))) %>%
ggplot(aes(x = gene_name, y = counts, fill = sample_short)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(option = "viridis", discrete = T) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
coord_flip() +
scale_x_discrete(limits = rev) +
labs(fill = "Sample") +
ylab(label = "TPM") +
xlab(label = "Gene name") +
facet_wrap(~kit_name) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale()))
Log TPM is often plotted instead of raw values, but can be misleading on barplot…
gene_annotation %>%
filter(!gene_type %in% c("human", "pk")) %>%
left_join(gene_tpm_filtered[1:18], by = c("gene_id", "gene_name")) %>%
filter(rowSums(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`)) != 0) %>%
pivot_longer(
cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`,
names_to = "sample",
values_to = "counts"
) %>%
left_join(samples, by = "sample") %>%
group_by(gene_id) %>%
mutate(total = sum(counts)) %>%
ungroup() %>%
arrange(species, gene_type, desc(total)) %>%
mutate(gene_name = factor(gene_name, levels = unique(gene_name))) %>%
ggplot(aes(x = gene_name, y = counts)) +
geom_point(aes(colour = sample_name)) +
scale_colour_viridis(option = "viridis", discrete = T) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
coord_flip() +
scale_x_discrete(limits = rev) +
# labs(colour = "Sample") +
labs(x = "Gene name", y = "log(TPM+1)", colour = "Sample") +
facet_wrap(~sample_name) +
scale_y_continuous(trans = "log1p")
gene_annotation %>%
filter(gene_type == "human_globin") %>%
left_join(gene_tpm_filtered[1:18], by = c("gene_id", "gene_name")) %>%
filter(rowSums(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`)) != 0) %>%
pivot_longer(
cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`,
names_to = "sample",
values_to = "counts"
) %>%
left_join(samples, by = "sample") %>%
group_by(gene_id) %>%
mutate(total = sum(counts)) %>%
ungroup() %>%
arrange(species, gene_type, desc(total)) %>%
mutate(gene_name = factor(gene_name, levels = unique(gene_name))) %>%
ggplot(aes(x = gene_name, y = counts)) +
geom_point(aes(colour = sample)) +
scale_colour_viridis(option = "viridis", discrete = T) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
coord_flip() +
scale_x_discrete(limits = rev) +
labs(x = "Gene name", y = "TPM", colour = "Sample") +
facet_wrap(~sample_name) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale()))
gene_annotation %>%
filter(gene_type == "human_rRNA") %>%
left_join(gene_tpm_filtered[1:18], by = c("gene_id", "gene_name")) %>%
filter(rowSums(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`)) != 0) %>%
pivot_longer(
cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`,
names_to = "sample",
values_to = "counts"
) %>%
left_join(samples, by = "sample") %>%
group_by(gene_id) %>%
mutate(total = sum(counts)) %>%
ungroup() %>%
arrange(species, gene_type, desc(total)) %>%
mutate(gene_name = factor(gene_name, levels = unique(gene_name))) %>%
ggplot(aes(x = gene_name, y = counts)) +
geom_point(aes(colour = sample)) +
scale_colour_viridis(option = "viridis", discrete = T) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
coord_flip() +
scale_x_discrete(limits = rev) +
labs(x = "Gene name", y = "TPM", colour = "Sample") +
facet_wrap(~sample_name) +
scale_y_continuous(trans = "log1p")
TPM values appear to be the most appropriate metric to use.
These gene counts were aggregated across all samples (tximport of transcript counts) and filtered to only genes with a minimum count of 10 reads for at least 1 sample.
gene_counts_txi_filtered_long %>%
left_join(samples, by = "sample") %>%
group_by(sample_name, gene_type) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
ggplot(aes(y = sum_reads, x = sample_name, fill = gene_type)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(
option = "viridis", discrete = T,
# limits = rev,
labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
) +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = "Sum of gene counts", fill = "RNA type")
Filtered to only genes with a minimum count of 10 reads for at least 1 sample.
gene_tpm_filtered_long %>%
left_join(samples, by = "sample") %>%
group_by(sample_name, gene_type) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
ggplot(aes(y = sum_reads, x = sample_name, fill = gene_type)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(
option = "viridis", discrete = T,
# limits = rev,
labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
) +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = "Sum of TPM counts", fill = "RNA type")
These gene counts were aggregated across all samples (tximport of transcript counts) and filtered to only genes with a minimum count of 10 reads for at least 1 sample.
# absolute numbers
gene_counts_txi_filtered_long %>%
left_join(samples, by = "sample") %>%
group_by(sample_name, species) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
ggplot(aes(y = sum_reads, x = sample_name, fill = species)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(
option = "viridis", discrete = T,
# limits = rev
) +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = "Sum of gene counts", fill = "Species")
# proportion
gene_counts_txi_filtered_long %>%
left_join(samples, by = "sample") %>%
group_by(sample_name, species) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
ggplot(aes(y = sum_reads, x = sample_name, fill = species)) +
geom_bar(position = "fill", stat = "identity") +
scale_fill_viridis(
option = "viridis", discrete = T,
# limits = rev
) +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = "Proportion of gene counts", fill = "Species")
Note that since TPM is already scaled by library size, the absolute and proportion plots are identical.
# absolute numbers
gene_tpm_filtered_long %>%
left_join(samples, by = "sample") %>%
group_by(sample_name, species) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
ggplot(aes(y = sum_reads, x = sample_name, fill = species)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(
option = "viridis", discrete = T,
# limits = rev
) +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = "Sum of TPM counts", fill = "Species")
# proportion
gene_tpm_long %>%
left_join(samples, by = "sample") %>%
group_by(sample_name, species) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
ggplot(aes(y = sum_reads, x = sample_name, fill = species)) +
geom_bar(position = "fill", stat = "identity") +
scale_fill_viridis(
option = "viridis", discrete = T,
# limits = rev
) +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = "Proportion of TPM counts", fill = "Species")
# create binary matrices with presence/absence of genes
gene_tpm_filtered_binary <- gene_tpm_filtered %>%
mutate(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`, .fns = ~ .x > 0)) %>%
select(!c(gene_name, gene_type, species))
gene_tpm_pk_filtered_binary <- gene_tpm_filtered %>%
mutate(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`, .fns = ~ .x > 0)) %>%
filter(species == "pk") %>%
select(!c(gene_name, gene_type, species))
gene_tpm_filtered_binary_list <- purrr::set_names(colnames(gene_tpm_filtered_binary %>% select(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`))) %>%
purrr::imap(~ gene_tpm_filtered_binary$gene_id[gene_tpm_filtered_binary[, .x, drop = T]])
gene_tpm_pk_filtered_binary_list <- purrr::set_names(colnames(gene_tpm_pk_filtered_binary %>% select(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`))) %>%
purrr::imap(~ gene_tpm_pk_filtered_binary$gene_id[gene_tpm_pk_filtered_binary[, .x, drop = T]])
comparison_centpip <- samples %>%
filter(WBC_depletion == "centpip") %>%
select(sample)
comparison_plasmo <- samples %>%
filter(WBC_depletion == "plasmo") %>%
select(sample)
comparison_pmacs <- samples %>%
filter(WBC_depletion == "pmacs") %>%
select(sample)
comparison_cellulose <- samples %>%
filter(WBC_depletion == "cellulose") %>%
select(sample)
comparison_nofilter <- samples %>%
filter(WBC_depletion == "nofilter") %>%
filter(!sample == "104974-001-006_S1_L001")
comparison_mrna_qiagen <- samples %>%
filter(kit == "mRNA_qiagen_FSglob")
comparison_trna_qiagen <- samples %>%
filter(kit == "tRNA_qiagen_FSglobH")
comparison_mrna_illumina <- samples %>%
filter(kit == "mRNA_illumina")
draw_venn <- function(set_list, names, title, output) {
VennDiagram::venn.diagram(
x = set_list,
category.names = names,
# Circles
lwd = 2,
# lty = "blank",
# col = palette.colors(3),
# fill = palette.colors(3),
# col = hcl.colors(3, palette = "viridis", alpha = 0.9),
# fill = hcl.colors(3, palette = "viridis", alpha = 0.4),
col = hcl.colors(3, palette = "Set 2", alpha = 0.9),
fill = hcl.colors(3, palette = "Set 2", alpha = 0.4),
# Numbers
cex = .9,
fontface = "bold",
fontfamily = "sans",
# Set names
cat.cex = 0.9,
cat.fontface = "bold",
cat.default.pos = "outer",
cat.pos = c(-20, 20, 135),
cat.dist = c(0.055, 0.055, 0.11),
cat.fontfamily = "sans",
rotation = 1,
# title
main = title,
main.cex = 1.1,
main.fontface = "bold",
main.fontfamily = "sans",
# output features
height = 1200,
width = 1200,
resolution = 300,
compression = "lzw",
filename = output,
output = TRUE,
disable.logging = TRUE
)
}
Gene counts were filtered on a minimum (raw) count of 10 for at least 1 sample before checking for presence/absence.
v <- draw_venn(
gene_tpm_filtered_binary_list[comparison_centpip$sample],
c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
"Centpip",
NULL
# here("results-refseq/figures/venn-centpip.png")
)
grid.newpage()
grid.draw(v)
v <- draw_venn(
gene_tpm_filtered_binary_list[comparison_plasmo$sample],
c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
"Plasmo",
NULL
# here("results-refseq/figures/venn-plasmo.png")
)
grid.newpage()
grid.draw(v)
v <- draw_venn(
gene_tpm_filtered_binary_list[comparison_pmacs$sample],
c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
"PMACS",
NULL
# here("results-refseq/figures/venn-pmacs.png")
)
grid.newpage()
grid.draw(v)
v <- draw_venn(
gene_tpm_filtered_binary_list[comparison_cellulose$sample],
c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
"Cellulose",
NULL
# here("results-refseq/figures/venn-cellulose.png")
)
grid.newpage()
grid.draw(v)
v <- draw_venn(
gene_tpm_filtered_binary_list[comparison_nofilter$sample],
c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
"Cellulose",
NULL
# here("results-refseq/figures/venn-nofilter.png")
)
grid.newpage()
grid.draw(v)
Gene counts were filtered on a minimum (raw) count of 10 for at least 1 sample before checking for presence/absence.
v <- draw_venn(
gene_tpm_pk_filtered_binary_list[comparison_centpip$sample],
c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
"Centpip",
NULL
# here("results-refseq/figures/venn-centpip_pk.png")
)
grid.newpage()
grid.draw(v)
v <- draw_venn(
gene_tpm_pk_filtered_binary_list[comparison_plasmo$sample],
c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
"Plasmo",
NULL
# here("results-refseq/figures/venn-plasmo_pk.png")
)
grid.newpage()
grid.draw(v)
v <- draw_venn(
gene_tpm_pk_filtered_binary_list[comparison_pmacs$sample],
c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
"PMACS",
NULL
# here("results-refseq/figures/venn-pmacs_pk.png")
)
grid.newpage()
grid.draw(v)
v <- draw_venn(
gene_tpm_pk_filtered_binary_list[comparison_cellulose$sample],
c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
"Cellulose",
NULL
# here("results-refseq/figures/venn-cellulose_pk.png")
)
grid.newpage()
grid.draw(v)
v <- draw_venn(
gene_tpm_pk_filtered_binary_list[comparison_nofilter$sample],
c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
"No filter",
NULL
# here("results-refseq/figures/venn-nofilter_pk.png")
)
grid.newpage()
grid.draw(v)
# Use upset plots instead of five-way venn diagram (alternatively check out proportional area venn)
# mRNA Qiagen
UpSetR::upset(
as.data.frame(gene_tpm_filtered_binary %>%
select((comparison_mrna_qiagen %>% select(sample) %>% pull())) %>%
rename(all_of(c(
"no filter" = "104974-001-001_S1_L001",
"cent/pip" = "104974-001-002_S1_L001",
"plasmodipur" = "104974-001-003_S1_L001",
"PMACS" = "104974-001-004_S1_L001",
"cellulose" = "104974-001-005_S1_L001"
))) %>%
mutate_all(as.integer)),
keep.order = TRUE,
sets.x.label = "Genes per filter",
mainbar.y.label = "Intersections for mRNA Qiagen",
order.by = "freq"
)
# tRNA Qiagen
UpSetR::upset(
as.data.frame(gene_tpm_filtered_binary %>%
select((comparison_trna_qiagen %>% select(sample) %>% pull())) %>%
rename(all_of(c(
"no filter" = "104974-001-007_S1_L001",
"cent/pip" = "104974-001-008_S1_L001",
"plasmodipur" = "104974-001-009_S1_L001",
"PMACS" = "104974-001-010_S1_L001",
"cellulose" = "104974-001-011_S1_L001"
))) %>%
mutate_all(as.integer)),
keep.order = TRUE,
sets.x.label = "Genes per filter",
mainbar.y.label = "Intersections for tRNA Qiagen",
order.by = "freq"
)
# mRNA Illumina
UpSetR::upset(
as.data.frame(gene_tpm_filtered_binary %>%
select((comparison_mrna_illumina %>% select(sample) %>% pull())) %>%
rename(all_of(c(
"no filter" = "104974-001-012_S1_L001",
"cent/pip" = "104974-001-013_S1_L001",
"plasmodipur" = "104974-001-014_S1_L001",
"PMACS" = "104974-001-015_S1_L001",
"cellulose" = "104974-001-016_S1_L001"
))) %>%
mutate_all(as.integer)),
keep.order = TRUE,
sets.x.label = "Genes per filter",
mainbar.y.label = "Intersections for mRNA Illumina",
order.by = "freq"
)
# mRNA Qiagen
UpSetR::upset(
as.data.frame(gene_tpm_pk_filtered_binary %>%
select((comparison_mrna_qiagen %>% select(sample) %>% pull())) %>%
rename(all_of(c(
"no filter" = "104974-001-001_S1_L001",
"cent/pip" = "104974-001-002_S1_L001",
"plasmodipur" = "104974-001-003_S1_L001",
"PMACS" = "104974-001-004_S1_L001",
"cellulose" = "104974-001-005_S1_L001"
))) %>%
mutate_all(as.integer)),
keep.order = TRUE,
sets.x.label = "Genes per filter",
mainbar.y.label = "Intersections for mRNA Qiagen",
order.by = "freq"
)
# tRNA Qiagen
UpSetR::upset(
as.data.frame(gene_tpm_pk_filtered_binary %>%
select((comparison_trna_qiagen %>% select(sample) %>% pull())) %>%
rename(all_of(c(
"no filter" = "104974-001-007_S1_L001",
"cent/pip" = "104974-001-008_S1_L001",
"plasmodipur" = "104974-001-009_S1_L001",
"PMACS" = "104974-001-010_S1_L001",
"cellulose" = "104974-001-011_S1_L001"
))) %>%
mutate_all(as.integer)),
keep.order = TRUE,
sets.x.label = "Genes per filter",
mainbar.y.label = "Intersections for tRNA Qiagen",
order.by = "freq"
)
# mRNA Illumina
UpSetR::upset(
as.data.frame(gene_tpm_pk_filtered_binary %>%
select((comparison_mrna_illumina %>% select(sample) %>% pull())) %>%
rename(all_of(c(
"no filter" = "104974-001-012_S1_L001",
"cent/pip" = "104974-001-013_S1_L001",
"plasmodipur" = "104974-001-014_S1_L001",
"PMACS" = "104974-001-015_S1_L001",
"cellulose" = "104974-001-016_S1_L001"
))) %>%
mutate_all(as.integer)),
keep.order = TRUE,
sets.x.label = "Genes per filter",
mainbar.y.label = "Intersections for mRNA Illumina",
order.by = "freq"
)
Which samples are most similar to each other in terms of gene
expression profile? Use Euclidean distance between samples on
rlog transformed gene counts (filtered to genes with a
minimum of 10 reads per sample).
heatmap_of_distances <- function(transformed_data, title) {
sampleDists <- dist(t(assay(transformed_data)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- transformed_data$sample_short
colnames(sampleDistMatrix) <- NULL
# add grouping columns
anno <- as.data.frame(colData(transformed_data)[, c("kit_name", "filter_name")]) %>%
mutate(kit_name = as.factor(kit_name)) %>%
mutate(filter_name = as.factor(filter_name)) %>%
rename_with(~ str_to_title(str_replace(.x, "_", " ")))
# set names of samples
colnames(sampleDistMatrix) <- (transformed_data)$sample_short
rownames(anno) <- (transformed_data)$sample_short
# Adapted from: https://slowkow.com/notes/pheatmap-tutorial/
mat_colors <- list(
`Kit Name` = hcl.colors(length(unique(anno$`Kit Name`)), palette = "plasma", alpha = 1),
`Filter Name` = palette.colors(length(unique(anno$`Filter Name`)), palette = "Okabe-Ito", alpha = 1)
)
names(mat_colors$`Kit Name`) <- unique(anno$`Kit Name`)
names(mat_colors$`Filter Name`) <- unique(anno$`Filter Name`)
# create heatmap
p <- pheatmap(sampleDistMatrix,
annotation_col = anno,
annotation_colors = mat_colors,
clustering_distance_rows = sampleDists,
clustering_distance_cols = sampleDists,
color = viridis(
n = 256, alpha = 1,
begin = 0, end = 1, option = "viridis"
),
# main = "Sample-to-sample Euclidian distance based on all genes",
main = title,
scale = "none",
show_rownames = TRUE,
show_colnames=FALSE
) +
theme(plot.title = element_markdown())
return(p)
}
heatmap_of_distances(rld_filtered, str_wrap("Sample-to-sample Euclidian distance based on all genes - rlog", 40))
## NULL
The VST transform leads to slightly different results: the two large blocks remains the same, but the positioning of samples 5, 11 and 16 differs. Overall, since the samples are so similar overall, slight deviations in the clustering of the distance matrix are not too surprising.
heatmap_of_distances(vsd_filtered, str_wrap("Sample-to-sample Euclidian distance based on all genes - vst", 40))
## NULL
heatmap_of_distances(rld_filtered[rowData(rld_filtered)$species == "pk", ], str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* genes - rlog before subset", 40))
## NULL
heatmap_of_distances(vsd_filtered[rowData(vsd_filtered)$species == "pk", ], str_wrap("Sample-to-sample Euclidian distance based on P. knowlesi genes - vst before subset", 40))
## NULL
heatmap_of_distances(rld_pk_filtered, str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* genes - rlog after subset", 40))
## NULL
heatmap_of_distances(vsd_pk_filtered, str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* genes - vst after subset", 40))
## NULL
heatmap_of_distances(rld_filtered[rowData(rld_filtered)$species == "pk" & rowData(rld_filtered)$gene_biotype == "protein_coding", ], str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi** protein-coding genes", 40))
## NULL
heatmap_of_distances(vsd_filtered[rowData(vsd_filtered)$species == "pk" & rowData(vsd_filtered)$gene_biotype == "protein_coding", ], str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* protein-coding genes", 40))
## NULL
heatmap_of_distances(rld_pk_filtered_coding, str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* protein-coding genes - rlog after subset", 40))
## NULL
heatmap_of_distances(vsd_pk_filtered_coding, str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* protein-coding genes - vst after subset", 40))
## NULL
heatmap_of_counts <- function(raw_data, transformed_data, top_type = "mean", top_n = 20, center = TRUE) {
# select n most highly expressed/variable genes
if (top_type == "mean") {
select <- order(rowMeans(counts(raw_data, normalized = FALSE)),
decreasing = TRUE
)[1:top_n]
} else if (top_type == "median") {
select <- order(rowMedians(counts(raw_data, normalized = FALSE)),
decreasing = TRUE
)[1:top_n]
} else if (top_type == "var") {
select <- head(order(rowVars(assay(transformed_data)), decreasing = TRUE), top_n)
}
# select transformed data and center if requested
mat <- assay(transformed_data)[select, ]
if (center == TRUE){
mat <- mat - rowMeans(mat)
}
# add grouping columns
anno <- as.data.frame(colData(transformed_data)[, c("kit_name", "filter_name")]) %>%
mutate(kit_name = as.factor(kit_name)) %>%
mutate(filter_name = as.factor(filter_name)) %>%
rename_with(~ str_to_title(str_replace(.x, "_", " ")))
# set names of samples
colnames(mat) <- (transformed_data)$sample_short
rownames(anno) <- (transformed_data)$sample_short
# Adapted from: https://slowkow.com/notes/pheatmap-tutorial/
mat_colors <- list(
`Kit Name` = hcl.colors(length(unique(anno$`Kit Name`)), palette = "plasma", alpha = 1),
`Filter Name` = palette.colors(length(unique(anno$`Filter Name`)), palette = "Okabe-Ito", alpha = 1)
)
names(mat_colors$`Kit Name`) <- unique(anno$`Kit Name`)
names(mat_colors$`Filter Name`) <- unique(anno$`Filter Name`)
# create heatmap
pheatmap(mat,
annotation_col = anno,
annotation_colors = mat_colors,
color = viridis(n = 256, alpha = 1, begin = 0, end = 1, option = "viridis"),
# cluster_rows = FALSE,
# cluster_cols = FALSE,
show_rownames = FALSE,
)
# create table of genes
gene_annotation %>% filter(gene_id %in% rownames(mat))
}
This plot clusters the top n=20 most highly expressed genes (mean) centred on the gene’s mean expression level across all samples using the VST data.
Several of the top expressed genes are globin and rRNA genes (and most reads are human).
heatmap_of_counts(gse_filtered, vsd_filtered, "mean", 20, TRUE)
This plot cluster the top n=20 most highly expressed genes (mean) centred on the gene’s mean expression level across all samples using the rlog transformed data.
Several of the top expressed genes are globin and rRNA genes (and most reads are human).
heatmap_of_counts(gse_filtered, rld_filtered, "mean", 20, TRUE)
This plot clusters the top n=20 most highly expressed genes (median) centred on the gene’s mean expression level across all samples using the VST data.
Several of the top expressed genes are globin and rRNA genes (and most reads are human).
heatmap_of_counts(gse_filtered, vsd_filtered, "median", 20, TRUE)
This plot cluster the top n=20 most highly expressed genes (median) centred on the gene’s mean expression level across all samples using the rlog transformed data.
Several of the top expressed genes are globin and rRNA genes (and most reads are human).
heatmap_of_counts(gse_filtered, rld_filtered, "median", 20, TRUE)
This plot clusters the top n=20 most variable genes centred on the gene’s mean expression level across all samples using the VST data.
Several of the top variable genes are human/Pk rRNA genes (and most reads are human).
heatmap_of_counts(gse_filtered, vsd_filtered, "var", 20, TRUE)
This plot cluster the top n=20 most variable genes centred on the gene’s mean expression level across all samples using the rlog transformed data.
Several of the top variable genes are human genes (but globin and rRNA are rare).
heatmap_of_counts(gse_filtered, rld_filtered, "var", 20, TRUE)
This plot clusters the top n=20 most highly expressed genes centred on the gene’s mean expression level across all samples using the VST data.
The choice of transformation only has a minor effect on this subset of top-expressed genes.
heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk" & rowData(gse_filtered)$gene_biotype == "protein_coding"],
vsd_filtered[rowData(vsd_filtered)$species == "pk" & rowData(vsd_filtered)$gene_biotype == "protein_coding"],
"mean", 20, TRUE)
This plot cluster the top n=20 most highly expressed genes centred on the gene’s mean expression level across all samples using the rlog transformed data.
The choice of transformation only has a minor effect on this subset of top-expressed genes.
heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk" & rowData(gse_filtered)$gene_biotype == "protein_coding"],
rld_filtered[rowData(rld_filtered)$species == "pk" & rowData(rld_filtered)$gene_biotype == "protein_coding"],
"mean", 20, TRUE)
This plot clusters the top n=20 most variable genes centred on the gene’s mean expression level across all samples using the VST data.
The choice of transformation only has a minor effect on this subset of top-variable genes.
heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk" & rowData(gse_filtered)$gene_biotype == "protein_coding"],
vsd_filtered[rowData(vsd_filtered)$species == "pk" & rowData(vsd_filtered)$gene_biotype == "protein_coding"],
"var", 20, TRUE)
This plot cluster the top n=20 most variable genes centred on the gene’s mean expression level across all samples using the rlog transformed data.
The choice of transformation only has a minor effect on this subset of top-variable genes.
heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk" & rowData(gse_filtered)$gene_biotype == "protein_coding"],
rld_filtered[rowData(rld_filtered)$species == "pk" & rowData(rld_filtered)$gene_biotype == "protein_coding"],
"var", 20, TRUE)
This plot cluster the top n=20 most highly expressed Pk protein coding genes (centred on the gene’s mean expression level) across all samples using the vst transformed data, without clustering. The data was transformed after subsetting.
heatmap_of_counts(gse_pk_filtered_coding,
vsd_pk_filtered_coding,
"mean", 20, TRUE)
This plot cluster the top n=20 most highly expressed Pk protein coding genes (centred on the gene’s mean expression level) across all samples using the rlog transformed data, without clustering. The data was transformed after subsetting.
heatmap_of_counts(gse_pk_filtered_coding,
rld_pk_filtered_coding,
"mean", 20, TRUE)
This plot cluster the top n=20 most highly variable Pk protein coding genes across all samples using the vst transformed data, without clustering. The data was transformed after subsetting.
heatmap_of_counts(gse_pk_filtered_coding,
vsd_pk_filtered_coding,
"var", 20, TRUE)
This plot cluster the top n=20 most highly variable Pk protein coding genes across all samples using the rlog transformed data, without clustering. The data was transformed after subsetting.
heatmap_of_counts(gse_pk_filtered_coding,
rld_pk_filtered_coding,
"var", 20, TRUE)
This plot cluster the top n=20 most highly expressed Pk genes (centred on the gene’s mean expression level) across all samples using the rlog transformed data, without clustering.
Several of the top expressed genes in the previous plot are rRNA genes, but not all of them.
heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk"],
vsd_filtered[rowData(gse_filtered)$species == "pk"],
"mean", 20, TRUE)
This plot cluster the top n=20 most highly expressed Pk genes (centred on the gene’s mean expression level) across all samples using the rlog transformed data, without clustering.
heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk"],
vsd_filtered[rowData(gse_filtered)$species == "pk"],
"mean", 20, TRUE)
This plot cluster the top n=20 most highly variable Pk genes across all samples using the rlog transformed data, without clustering.
heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk"],
vsd_filtered[rowData(gse_filtered)$species == "pk"],
"var", 20, TRUE)
This plot cluster the top n=20 most highly variable Pk genes across all samples using the rlog transformed data, without clustering.
heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk"],
vsd_filtered[rowData(gse_filtered)$species == "pk"],
"var", 20, TRUE)
This plot cluster the top n=20 most highly expressed Pk genes (centred on the gene’s mean expression level) across all samples using the rlog transformed data, without clustering. The data was transformed after subsetting.
Several of the top expressed genes in the previous plot are rRNA genes, but not all of them.
heatmap_of_counts(gse_pk_filtered,
vsd_pk_filtered,
"mean", 20, TRUE)
This plot cluster the top n=20 most highly expressed Pk genes (centred on the gene’s mean expression level) across all samples using the rlog transformed data, without clustering. The data was transformed after subsetting.
Several of the top expressed genes in the previous plot are rRNA genes, but not all of them.
heatmap_of_counts(gse_pk_filtered,
rld_pk_filtered,
"mean", 20, TRUE)
This plot cluster the top n=20 most highly variable Pk genes across all samples using the rlog transformed data, without clustering. The data was transformed after subsetting.
heatmap_of_counts(gse_pk_filtered,
vsd_pk_filtered,
"var", 20, TRUE)
This plot cluster the top n=20 most highly variable Pk genes across all samples using the rlog transformed data, without clustering. The data was transformed after subsetting.
heatmap_of_counts(gse_pk_filtered,
rld_pk_filtered,
"var", 20, TRUE)
The following figure plots the (log-transformed) TPM of each protein-coding P. knowlesi gene (filtered on a minimum count of 10 for at least one sample) for all pairwise combinations of samples.
mat <- round(
cor(
gene_tpm_filtered %>%
filter(species == "pk") %>%
filter(gene_biotype == "protein_coding") %>%
select(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`) %>%
rename_with(~samples %>% pull(sample_name) %>% as.character(), everything())
),
1)
corr <- mat
ggcorrplot(corr,
type = "lower",
method = "circle",
hc.order = FALSE,
outline.color = "white",
p.mat = cor_pmat(mat),
lab = TRUE,
legend.title = "Pearson correlation of TPM values") +
scale_fill_gradientn(colors = viridis(256, option = 'D'), limits = c(-1,1),
name = "Pearson correlation")
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
correlation_plot <- ggpairs(
gene_tpm_filtered %>%
filter(species == "pk") %>%
filter(gene_biotype == "protein_coding"),
columns = 3:18,
columnLabels = samples$sample_short,
upper = list(continuous = wrap("cor", size = 5)),
) +
theme(
axis.text = element_text(size = 9),
axis.title = element_text(size = 9),
strip.text.x = element_text(size = 11),
strip.text.y = element_text(size = 10),
axis.text.x = element_text(size = 6),
axis.text.y = element_text(size = 9)
)
# scale_y_discrete()
correlation_plot
correlation_plot_log <- ggpairs(
gene_tpm_filtered %>%
filter(species == "pk") %>%
filter(gene_biotype == "protein_coding"),
columns = 3:18, # `104974-001-001_S1_L001`:`104974-001-002_S1_L001`,
lower = list(continuous = "smooth"),
upper = list(continuous = wrap("cor", size = 5)),
columnLabels = samples$sample_short
) +
theme(
axis.text = element_text(size = 9),
axis.title = element_text(size = 9),
strip.text.x = element_text(size = 11),
strip.text.y = element_text(size = 10),
axis.text.x = element_text(size = 6),
axis.text.y = element_text(size = 9)
) +
scale_x_continuous(trans = "log1p") +
scale_y_continuous(trans = "log1p")
correlation_plot_log
All PCA plots use the top 500 most variable genes and are filtered to only genes with a minimum count of 10 reads for at least one sample, unless otherwise stated.
pcaData <- plotPCA(rld_filtered, intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
labs(
title = expression(paste("PCA of rlog-transformed counts of human and ", italic("P. knowlesi"), " genes")),
color = "Library preparation kit", shape = "Filter method"
) +
coord_fixed() +
scale_color_viridis(discrete = TRUE) +
theme_bw()
pcaData <- plotPCA(vsd_filtered, intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
labs(
title = expression(paste("PCA of VST counts of human and ", italic("P. knowlesi"), " genes")),
color = "Library preparation kit", shape = "Filter method"
) +
coord_fixed() +
scale_color_viridis(discrete = TRUE) +
theme_bw()
pcaData <- plotPCA(rld_filtered[rowData(rld_filtered)$species == "pk", ], intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
labs(
title = expression(paste("PCA of rlog-transformed counts of ", italic("P. knowlesi"), " genes")),
color = "Library preparation kit", shape = "Depletion method"
) +
coord_fixed() +
scale_color_viridis(discrete = TRUE) +
theme_bw()
pcaData <- plotPCA(vsd_filtered[rowData(vsd_filtered)$species == "pk", ], intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
labs(
title = expression(paste("PCA of VST counts of ", italic("P. knowlesi"), " genes")),
color = "Library preparation kit", shape = "Depletion method"
) +
coord_fixed() +
scale_color_viridis(discrete = TRUE) +
theme_bw()
pcaData <- plotPCA(rld_filtered[rowData(rld_filtered)$species == "pk" & rowData(rld_filtered)$gene_biotype == "protein_coding"], intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
labs(
title = expression(paste("PCA of rlog-transformed counts of ", italic("P. knowlesi"), " genes")),
color = "Library preparation kit", shape = "Depletion method"
) +
coord_fixed() +
scale_color_viridis(discrete = TRUE) +
theme_bw()
pcaData <- plotPCA(vsd_filtered[rowData(vsd_filtered)$species == "pk" & rowData(vsd_filtered)$gene_biotype == "protein_coding"], intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
labs(
title = expression(paste("PCA of VST counts of ", italic("P. knowlesi"), " genes")),
color = "Library preparation kit", shape = "Depletion method"
) +
coord_fixed() +
scale_color_viridis(discrete = TRUE) +
theme_bw()
pcaData <- plotPCA(rld_pk_filtered, intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
labs(
title = expression(paste("PCA of rlog-transformed counts of ", italic("P. knowlesi"), " genes")),
color = "Library preparation kit", shape = "Depletion method"
) +
coord_fixed() +
scale_color_viridis(discrete = TRUE) +
theme_bw()
pcaData <- plotPCA(vsd_pk_filtered, intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
labs(
title = expression(paste("PCA of VST counts of ", italic("P. knowlesi"), " genes")),
color = "Library preparation kit", shape = "Depletion method"
) +
coord_fixed() +
scale_color_viridis(discrete = TRUE) +
theme_bw()
protein_gene_filter <- intersect(rownames(rowData(rld_filtered)), gene_annotation %>% filter(gene_biotype == "protein_coding" & species == "pk") %>% select(gene_id) %>% pull())
pcaData <- plotPCA(rld_filtered[protein_gene_filter, ], intgroup = c("kit_name", "filter_name"), returnData = TRUE, ntop = 500)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
labs(
title = expression(paste("PCA of rlog-transformed counts of ", italic("P. knowlesi"), " genes")),
color = "Library preparation kit", shape = "Depletion method"
) +
coord_fixed() +
scale_color_viridis(discrete = TRUE) +
theme_bw()
protein_gene_filter <- intersect(rownames(rowData(vsd_filtered)), gene_annotation %>% filter(gene_biotype == "protein_coding" & species == "pk") %>% select(gene_id) %>% pull())
pcaData <- plotPCA(vsd_filtered[protein_gene_filter, ], intgroup = c("kit_name", "filter_name"), returnData = TRUE, ntop = 500)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
labs(
title = expression(paste("PCA of VST counts of ", italic("P. knowlesi"), " genes")),
color = "Library preparation kit", shape = "Depletion method"
) +
coord_fixed() +
scale_color_viridis(discrete = TRUE) +
theme_bw()
pcaData <- plotPCA(rld_pk_filtered_coding, intgroup = c("kit_name", "filter_name"), returnData = TRUE, ntop = 500)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
labs(
title = expression(paste("PCA of rlog counts of ", italic("P. knowlesi"), " protein coding genes")),
color = "Library preparation kit", shape = "Depletion method"
) +
coord_fixed() +
scale_color_viridis(discrete = TRUE) +
theme_bw()
pcaData <- plotPCA(vsd_pk_filtered_coding, intgroup = c("kit_name", "filter_name"), returnData = TRUE, ntop = 500)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
labs(
title = expression(paste("PCA of VST counts of ", italic("P. knowlesi"), " protein coding genes")),
color = "Library preparation kit", shape = "Depletion method"
) +
coord_fixed() +
scale_color_viridis(discrete = TRUE) +
theme_bw()
samples_order <- samples %>%
arrange(factor(samples$kit, levels = c("mRNA_illumina", "mRNA_qiagen_FSglobH", "mRNA_qiagen_FSglob", "tRNA_qiagen_FSglobH")),
factor(samples$WBC_depletion, levels = c("nofilter", "centpip", "plasmo", "pmacs", "cellulose"))) %>%
select(sample_name)
p2_a <- gene_counts_txi_filtered_long %>%
left_join(samples, by = "sample") %>%
group_by(sample_name, species) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
# arrange(factor(sample_name, levels = samples_order %>% pull())) %>%
mutate(sample_name = factor(sample_name, levels = samples_order %>% pull())) %>%
ggplot(aes(x = sum_reads, y = sample_name, fill = species)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(
option = "viridis", discrete = T,
# limits = rev,
# labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
) +
scale_y_discrete(limits = rev) +
scale_x_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(y = "Sample", x = "Sum of all gene counts", fill = "RNA type") +
theme_classic() +
ggtitle("A")
# print p2a to show y labels
p2_a
p2_a <- p2_a + theme(axis.title.y = element_blank(), axis.text.y = element_blank())
p2_b <- gene_tpm_filtered_long %>%
left_join(samples, by = "sample") %>%
group_by(sample_name, gene_type) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
mutate(sample_name = factor(sample_name, levels = samples_order %>% pull())) %>%
ggplot(aes(x = sum_reads, y = sample_name, fill = gene_type)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_manual(
values = c(`human` = "#440154FF",
`human_globin` = "#3B528BFF",
`human_rRNA` = "#21908CFF",
`pk_rRNA` = "#5DC863FF",
`pk` = "#FDE725FF"),
labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")),
expression(italic("P. knowlesi rRNA")))
) +
# scale_fill_viridis(
# option = "viridis", discrete = T,
# # limits = rev,
# labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")),
# expression(italic("P. knowlesi rRNA")))
# ) +
scale_y_discrete(limits = rev) +
scale_x_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(y = "Sample", x = "Sum of TPM counts", fill = "RNA type") +
theme_classic() +
theme(axis.title.y = element_blank(), axis.text.y = element_blank()) +
ggtitle("B")
p2_c <- gene_counts_txi_filtered_long %>%
left_join(samples, by = "sample") %>%
filter(species == "pk" & gene_biotype == "protein_coding") %>%
group_by(sample_name,) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
mutate(sample_name = factor(sample_name, levels = samples_order %>% pull())) %>%
ggplot(aes(x = sum_reads, y = sample_name)) +
geom_bar(position = "stack", stat = "identity", fill = "#FDE725FF") +
# scale_fill_manual(viridis::viridis(5)[4:5]) +
# scale_fill_manual(
# values = c(pk = "#5DC863FF", pk_rRNA = "#FDE725FF"),
# # scale_fill_viridis(
# # option = "viridis", discrete = T,
# # # limits = rev,
# labels = c(expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
# ) +
scale_y_discrete(limits = rev) +
scale_x_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(y = "Sample", x = str_wrap("Sum of *P. knowlesi* protein-coding gene counts", 10), fill = "RNA type") +
theme_classic() +
theme(axis.title.y = element_blank(), axis.text.y = element_blank(), axis.title.x = ggtext::element_markdown()) +
ggtitle("C")
fig <- ggarrange(
p2_a, p2_b, p2_c,
ncol = 3,
legend = "bottom",
common.legend = TRUE,
legend.grob = get_legend(p2_b)
# widths = c(2,1,2)
)
fig
ggsave(filename = here("results-refseq/figures/gene_counts.svg"),
plot = fig, width=10, height=6)
p2_a <- gene_counts_txi_filtered_long %>%
left_join(samples, by = "sample") %>%
group_by(sample_name, gene_type) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
ggplot(aes(y = sum_reads, x = sample_name, fill = gene_type)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(
option = "viridis", discrete = T,
# limits = rev,
labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
) +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = "Sum of gene counts", fill = "RNA type") +
theme_classic() +
ggtitle("A")
# print p2a to show y labels
p2_a
p2_a <- p2_a + theme(axis.title.y = element_blank(), axis.text.y = element_blank())
p2_b <- gene_tpm_filtered_long %>%
left_join(samples, by = "sample") %>%
group_by(sample_name, gene_type) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
ggplot(aes(y = sum_reads, x = sample_name, fill = gene_type)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(
option = "viridis", discrete = T,
# limits = rev,
labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
) +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = "Sum of TPM counts", fill = "RNA type") +
theme_classic() +
theme(axis.title.y = element_blank(), axis.text.y = element_blank()) +
ggtitle("B")
p2_c <- gene_counts_txi_filtered_long %>%
left_join(samples, by = "sample") %>%
filter(species == "pk") %>%
group_by(sample_name, gene_type) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
ggplot(aes(y = sum_reads, x = sample_name, fill = gene_type)) +
geom_bar(position = "stack", stat = "identity") +
# scale_fill_manual(viridis::viridis(5)[4:5]) +
scale_fill_manual(
values = c(pk = "#5DC863FF", pk_rRNA = "#FDE725FF"),
# scale_fill_viridis(
# option = "viridis", discrete = T,
# # limits = rev,
labels = c(expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
) +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = expression(paste("Sum of ", italic("P. knowlesi"), " gene counts")), fill = "RNA type") +
theme_classic() +
theme(axis.title.y = element_blank(), axis.text.y = element_blank()) +
ggtitle("C")
fig <- ggarrange(
p2_a, p2_b, p2_c,
ncol = 3,
common.legend = TRUE, legend = "bottom"
)
fig
samples_order <- samples %>%
arrange(factor(samples$kit, levels = c("mRNA_illumina", "mRNA_qiagen_FSglobH", "mRNA_qiagen_FSglob", "tRNA_qiagen_FSglobH")),
factor(samples$WBC_depletion, levels = c("nofilter", "centpip", "plasmo", "pmacs", "cellulose"))) %>%
select(sample_name)
p2_a <- gene_counts_txi_filtered_long %>%
left_join(samples, by = "sample") %>%
group_by(sample_name, gene_type) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
# arrange(factor(sample_name, levels = samples_order %>% pull())) %>%
mutate(sample_name = factor(sample_name, levels = samples_order %>% pull())) %>%
ggplot(aes(x = sum_reads, y = sample_name, fill = gene_type)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(
option = "viridis", discrete = T,
# limits = rev,
labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
) +
scale_y_discrete(limits = rev) +
scale_x_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(y = "Sample", x = "Sum of gene counts", fill = "RNA type") +
theme_classic() +
ggtitle("A")
# print p2a to show y labels
p2_a
p2_a <- p2_a + theme(axis.title.y = element_blank(), axis.text.y = element_blank())
p2_b <- gene_tpm_filtered_long %>%
left_join(samples, by = "sample") %>%
group_by(sample_name, gene_type) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
mutate(sample_name = factor(sample_name, levels = samples_order %>% pull())) %>%
ggplot(aes(x = sum_reads, y = sample_name, fill = gene_type)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(
option = "viridis", discrete = T,
# limits = rev,
labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
) +
scale_y_discrete(limits = rev) +
scale_x_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(y = "Sample", x = "Sum of TPM counts", fill = "RNA type") +
theme_classic() +
theme(axis.title.y = element_blank(), axis.text.y = element_blank()) +
ggtitle("B")
p2_c <- gene_counts_txi_filtered_long %>%
left_join(samples, by = "sample") %>%
filter(species == "pk") %>%
group_by(sample_name, gene_type) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
mutate(sample_name = factor(sample_name, levels = samples_order %>% pull())) %>%
ggplot(aes(x = sum_reads, y = sample_name, fill = gene_type)) +
geom_bar(position = "stack", stat = "identity") +
# scale_fill_manual(viridis::viridis(5)[4:5]) +
scale_fill_manual(
values = c(pk = "#5DC863FF", pk_rRNA = "#FDE725FF"),
# scale_fill_viridis(
# option = "viridis", discrete = T,
# # limits = rev,
labels = c(expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
) +
scale_y_discrete(limits = rev) +
scale_x_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(y = "Sample", x = expression(paste("Sum of ", italic("P. knowlesi"), " gene counts")), fill = "RNA type") +
theme_classic() +
theme(axis.title.y = element_blank(), axis.text.y = element_blank()) +
ggtitle("C")
fig <- ggarrange(
p2_a, p2_b, p2_c,
ncol = 3,
common.legend = TRUE, legend = "bottom"
)
fig
heatmap_of_distances <- function(transformed_data, title) {
sampleDists <- dist(t(assay(transformed_data)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- transformed_data$sample_short
colnames(sampleDistMatrix) <- NULL
# add grouping columns
anno <- as.data.frame(colData(transformed_data)[, c("kit_name", "filter_name")]) %>%
mutate(kit_name = as.factor(kit_name)) %>%
mutate(filter_name = as.factor(filter_name)) %>%
rename_with(~ str_to_title(str_replace(.x, "_", " ")))
# set names of samples
colnames(sampleDistMatrix) <- (transformed_data)$sample_short
rownames(anno) <- (transformed_data)$sample_short
# Adapted from: https://slowkow.com/notes/pheatmap-tutorial/
mat_colors <- list(
`Kit Name` = hcl.colors(length(unique(anno$`Kit Name`)), palette = "plasma", alpha = 1),
`Filter Name` = palette.colors(length(unique(anno$`Filter Name`)), palette = "Okabe-Ito", alpha = 1)
)
names(mat_colors$`Kit Name`) <- unique(anno$`Kit Name`)
names(mat_colors$`Filter Name`) <- unique(anno$`Filter Name`)
# create heatmap
p <- pheatmap(sampleDistMatrix,
annotation_col = anno,
annotation_colors = mat_colors,
clustering_distance_rows = sampleDists,
clustering_distance_cols = sampleDists,
color = viridis(
n = 256, alpha = 1,
begin = 0, end = 1, option = "viridis"
),
# main = "Sample-to-sample Euclidian distance based on all genes",
# main = title,
scale = "none",
show_rownames = TRUE,
show_colnames=FALSE
)
return(p)
}
p <- heatmap_of_distances(rld_filtered[rowData(rld_filtered)$species == "pk" & rowData(rld_filtered)$gene_biotype == "protein_coding", ], str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi** protein-coding genes", 40))
ggsave(filename = here("results-refseq/figures/genes_pk_coding_sample_dist_rlog.svg"),
plot = p)
## Saving 7 x 5 in image
p
p <- heatmap_of_distances(rld_pk_filtered_coding, str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* protein-coding genes - rlog after subset", 40))
ggsave(filename = here("results-refseq/figures/genes_pk_coding_sample_dist_rlog_transform_after_subset.svg"),
plot = p)
## Saving 7 x 5 in image
p
heatmap_of_counts <- function(raw_data, transformed_data, top_type = "mean", top_n = 20, center = TRUE) {
# select n most highly expressed/variable genes
if (top_type == "mean") {
select <- order(rowMeans(counts(raw_data, normalized = FALSE)),
decreasing = TRUE
)[1:top_n]
} else if (top_type == "median") {
select <- order(rowMedians(counts(raw_data, normalized = FALSE)),
decreasing = TRUE
)[1:top_n]
} else if (top_type == "var") {
select <- head(order(rowVars(assay(transformed_data)), decreasing = TRUE), top_n)
}
# select transformed data and center if requested
mat <- assay(transformed_data)[select, ]
if (center == TRUE){
mat <- mat - rowMeans(mat)
}
# add grouping columns
anno <- as.data.frame(colData(transformed_data)[, c("kit_name", "filter_name")]) %>%
mutate(kit_name = as.factor(kit_name)) %>%
mutate(filter_name = as.factor(filter_name)) %>%
rename_with(~ str_to_title(str_replace(.x, "_", " ")))
# set names of samples
colnames(mat) <- (transformed_data)$sample_short
rownames(anno) <- (transformed_data)$sample_short
# Adapted from: https://slowkow.com/notes/pheatmap-tutorial/
mat_colors <- list(
`Kit Name` = hcl.colors(length(unique(anno$`Kit Name`)), palette = "plasma", alpha = 1),
`Filter Name` = palette.colors(length(unique(anno$`Filter Name`)), palette = "Okabe-Ito", alpha = 1)
)
names(mat_colors$`Kit Name`) <- unique(anno$`Kit Name`)
names(mat_colors$`Filter Name`) <- unique(anno$`Filter Name`)
# create heatmap
pheatmap(mat,
annotation_col = anno,
annotation_colors = mat_colors,
color = viridis(n = 256, alpha = 1, begin = 0, end = 1, option = "viridis"),
# cluster_rows = FALSE,
# cluster_cols = FALSE,
show_rownames = FALSE,
)
}
# p <- heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk" & rowData(gse_filtered)$gene_biotype == "protein_coding"],
# rld_filtered[rowData(rld_filtered)$species == "pk" & rowData(rld_filtered)$gene_biotype == "protein_coding"],
# "mean", 20, TRUE)
# ggsave(filename = here("results-refseq/figures/genes_pk_coding_top_exp_rlog.svg"),
# plot = p, width=10, height=8)
p <- heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk" & rowData(gse_filtered)$gene_biotype == "protein_coding"],
rld_filtered[rowData(rld_filtered)$species == "pk" & rowData(rld_filtered)$gene_biotype == "protein_coding"],
"var", 20, TRUE)
ggsave(filename = here("results-refseq/figures/genes_pk_coding_top_var_rlog.svg"),
plot = p)
## Saving 7 x 5 in image
p
# p <- heatmap_of_counts(gse_pk_filtered_coding,
# rld_pk_filtered_coding,
# "mean", 20, TRUE)
# ggsave(filename = here("results-refseq/figures/genes_pk_coding_top_exp_rlog_transform_after_subset.svg"),
# plot = p, width=10, height=8)
p <- heatmap_of_counts(gse_pk_filtered_coding,
rld_pk_filtered_coding,
"var", 20, TRUE)
ggsave(filename = here("results-refseq/figures/genes_pk_coding_top_var_rlog_transform_after_subset.svg"),
plot = p)
## Saving 7 x 5 in image
p
# p <- heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk"],
# vsd_filtered[rowData(gse_filtered)$species == "pk"],
# "mean", 20, TRUE)
# ggsave(filename = here("results-refseq/figures/genes_pk_all_top_exp_rlog.svg"),
# plot = p, width=10, height=8)
p <- heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk"],
vsd_filtered[rowData(gse_filtered)$species == "pk"],
"var", 20, TRUE)
ggsave(filename = here("results-refseq/figures/genes_pk_all_top_var_rlog.svg"),
plot = p)
## Saving 7 x 5 in image
p
# p <- heatmap_of_counts(gse_pk_filtered,
# rld_pk_filtered,
# "mean", 20, TRUE)
# ggsave(filename = here("results-refseq/figures/genes_pk_all_top_exp_rlog_transform_after_subset.svg"),
# plot = p, width=10, height=8)
p <- heatmap_of_counts(gse_pk_filtered,
rld_pk_filtered,
"var", 20, TRUE)
ggsave(filename = here("results-refseq/figures/genes_pk_all_top_var_rlog_transform_after_subset.svg"),
plot = p)
## Saving 7 x 5 in image
p
pcaData <- plotPCA(rld_pk_filtered_coding, intgroup = c("kit_name", "filter_name"), returnData = TRUE, ntop = 500)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
p <- ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
labs(
title = expression(paste("PCA of rlog counts of ", italic("P. knowlesi"), " protein coding genes")),
color = "Library preparation kit", shape = "Depletion method"
) +
coord_fixed() +
scale_color_viridis(discrete = TRUE) +
theme_bw()
ggsave(filename = here("results-refseq/figures/pca_pk_coding_rlog_transform_after_subset.svg"),
plot = p)
## Saving 7 x 5 in image
p
mat <- round(
cor(
gene_tpm_filtered %>%
filter(species == "pk") %>%
filter(gene_biotype == "protein_coding") %>%
select(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`) %>%
rename_with(~samples %>% pull(sample_name) %>% as.character(), everything())
),
1)
corr <- mat
p <- ggcorrplot(corr,
type = "lower",
method = "circle",
hc.order = FALSE,
outline.color = "white",
p.mat = cor_pmat(mat),
lab = TRUE,
legend.title = "Pearson correlation of TPM values") +
scale_fill_gradientn(colors = viridis(256, option = 'D'), limits = c(-1,1), name = "Pearson correlation")
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
ggsave(filename = here("results-refseq/figures/pk_coding_tpm_correlation.svg"),
plot = p,
width = 10, height = 10)
p
# ordered by mRNA - tRNA
mat <- round(
cor(
gene_tpm_filtered %>%
filter(species == "pk") %>%
filter(gene_biotype == "protein_coding") %>%
select(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`) %>%
rename_with(~samples %>% pull(sample_name) %>% as.character(), everything()) %>%
relocate(samples_order %>% pull())
),
1)
corr <- mat
p <- ggcorrplot(corr,
type = "lower",
method = "circle",
hc.order = FALSE,
outline.color = "white",
p.mat = cor_pmat(mat),
lab = TRUE,
legend.title = "Pearson correlation of TPM values") +
scale_fill_gradientn(colors = viridis(256, option = 'D'), limits = c(-1,1), name = "Pearson correlation")
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
ggsave(filename = here("results-refseq/figures/pk_coding_tpm_correlation_ordered.svg"),
plot = p,
width = 10, height = 10)
p
correlation_plot
ggsave(filename = here("results-refseq/figures/correlation_protein_coding_genes_pk.png"),
plot = correlation_plot,
height = 4000, width = 6000, units = "px")
ggsave(filename = here("results-refseq/figures/correlation_protein_coding_genes_pk.svg"),
plot = correlation_plot,
height = 4000, width = 6000, units = "px")
# order by tRNA - mRNA
samples_order <- samples %>%
arrange(factor(samples$kit,
levels = c("mRNA_illumina", "mRNA_qiagen_FSglobH", "mRNA_qiagen_FSglob", "tRNA_qiagen_FSglobH")),
factor(samples$WBC_depletion,
levels = c("nofilter", "centpip", "plasmo", "pmacs", "cellulose")))
correlation_plot_ordered <- ggpairs(
gene_tpm_filtered %>%
filter(species == "pk") %>%
filter(gene_biotype == "protein_coding") %>%
relocate(
samples_order %>% select(sample) %>% pull()
),
columns = 1:16,
# columnLabels = samples_order %>% select(sample_short) %>% pull(),
columnLabels = samples_order %>% select(sample_short) %>% mutate(sample_short = factor(sample_short, levels = sample_short)) %>% pull(),
upper = list(continuous = wrap("cor", size = 5)),
) +
theme(
axis.text = element_text(size = 9),
axis.title = element_text(size = 9),
strip.text.x = element_text(size = 11),
strip.text.y = element_text(size = 10),
axis.text.x = element_text(size = 6),
axis.text.y = element_text(size = 9)
)
# scale_y_discrete()
correlation_plot_ordered
ggsave(filename = here("results-refseq/figures/correlation_protein_coding_genes_pk_ordered.png"),
plot = correlation_plot_ordered,
height = 4000, width = 6000, units = "px")
ggsave(filename = here("results-refseq/figures/correlation_protein_coding_genes_pk_ordered.svg"),
plot = correlation_plot_ordered,
height = 4000, width = 6000, units = "px")
correlation_plot_log
ggsave(here("results-refseq/figures/correlation_protein_coding_genes_logscaled.png"),
plot = correlation_plot_log,
height = 4000, width = 6000, units = "px")
ggsave(here("results-refseq/figures/correlation_protein_coding_genes_logscaled.svg"),
plot = correlation_plot_log,
height = 4000, width = 6000, units = "px")
# order by tRNA-mRNA
correlation_plot_log_ordered <- ggpairs(
gene_tpm_filtered %>%
filter(species == "pk") %>%
filter(gene_biotype == "protein_coding") %>%
relocate(
samples_order %>% select(sample) %>% pull()
),
columns = 1:16, # `104974-001-001_S1_L001`:`104974-001-002_S1_L001`,
lower = list(continuous = "smooth"),
upper = list(continuous = wrap("cor", size = 5)),
columnLabels = samples_order %>% select(sample_short) %>% mutate(sample_short = factor(sample_short, levels = sample_short)) %>% pull(),,
) +
theme(
axis.text = element_text(size = 9),
axis.title = element_text(size = 9),
strip.text.x = element_text(size = 11),
strip.text.y = element_text(size = 10),
axis.text.x = element_text(size = 6),
axis.text.y = element_text(size = 9)
) +
scale_x_continuous(trans = "log1p") +
scale_y_continuous(trans = "log1p")
correlation_plot_log
ggsave(here("results-refseq/figures/correlation_protein_coding_genes_logscaled_ordered.png"),
plot = correlation_plot_log_ordered,
height = 4000, width = 6000, units = "px")
ggsave(here("results-refseq/figures/correlation_protein_coding_genes_logscaled_ordered.svg"),
plot = correlation_plot_log_ordered,
height = 4000, width = 6000, units = "px")
Number of Pk genes in transcript_info file:
dim(transcript_info$gene %>% filter(grepl("PKNH", gene_id)))
## [1] 5393 2
Number of Pk genes in count matrices:
gene_tpm %>% filter(species == "pk")
Number of Pk genes in gff:
read_delim(here("data/ref-refseq-refseq/GCF_000006355.2_GCA_000006355.2_genomic.gff"), comment = "#", col_names = c("seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute")) %>%
mutate(id = row_number()) %>%
separate_rows(attribute, sep = ";") %>%
separate(attribute, c("attribute", "attribute_value"), sep = "=") %>%
pivot_wider(names_from = attribute, values_from = attribute_value) %>%
filter(!is.na(gene_biotype))
## Rows: 40620 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): seqname, source, feature, score, strand, frame, attribute
## dbl (2): start, end
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
read_delim(here("data/ref-refseq-refseq/GCF_000006355.2_GCA_000006355.2_genomic.gff"), comment = "#", col_names = c("seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute")) %>%
mutate(id = row_number()) %>%
separate_rows(attribute, sep = ";") %>%
separate(attribute, c("attribute", "attribute_value"), sep = "=") %>%
pivot_wider(names_from = attribute, values_from = attribute_value) %>%
filter(feature == "gene")
## Rows: 40620 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): seqname, source, feature, score, strand, frame, attribute
## dbl (2): start, end
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
read_delim(here("data/ref-refseq-refseq/GCF_000006355.2_GCA_000006355.2_genomic.gff"), comment = "#", col_names = c("seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute")) %>%
mutate(id = row_number()) %>%
separate_rows(attribute, sep = ";") %>%
separate(attribute, c("attribute", "attribute_value"), sep = "=") %>%
pivot_wider(names_from = attribute, values_from = attribute_value) %>%
filter(gene_biotype == "protein_coding")
## Rows: 40620 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): seqname, source, feature, score, strand, frame, attribute
## dbl (2): start, end
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Non-protein coding genes:
read_delim(here("data/ref-refseq-refseq/GCF_000006355.2_GCA_000006355.2_genomic.gff"), comment = "#", col_names = c("seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute")) %>%
mutate(id = row_number()) %>%
separate_rows(attribute, sep = ";") %>%
separate(attribute, c("attribute", "attribute_value"), sep = "=") %>%
pivot_wider(names_from = attribute, values_from = attribute_value) %>%
filter(!gene_biotype == "protein_coding")
## Rows: 40620 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): seqname, source, feature, score, strand, frame, attribute
## dbl (2): start, end
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
read_delim(here("data/ref-refseq-refseq/GCF_000006355.2_GCA_000006355.2_genomic.gff"), comment = "#", col_names = c("seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute")) %>%
mutate(id = row_number()) %>%
separate_rows(attribute, sep = ";") %>%
separate(attribute, c("attribute", "attribute_value"), sep = "=") %>%
pivot_wider(names_from = attribute, values_from = attribute_value) %>%
filter(gene_biotype == "rRNA")
## Rows: 40620 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): seqname, source, feature, score, strand, frame, attribute
## dbl (2): start, end
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.